src/5017007/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43                 = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47     goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  *      Copyright (c) 1986 by University of Toronto.
  54  *      Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  *      Permission is granted to anyone to use this software for any
  57  *      purpose on any computer system, and to redistribute it freely,
  58  *      subject to the following restrictions:
  59  *
  60  *      1. The author is not responsible for the consequences of use of
  61  *              this software, no matter how awful, even if they arise
  62  *              from defects in it.
  63  *
  64  *      2. The origin of this software must not be misrepresented, either
  65  *              by explicit claim or by omission.
  66  *
  67  *      3. Altered versions must be plainly marked as such, and must not
  68  *              be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86 #include "re_defs.h"
  87
  88 #ifdef PERL_IN_XSUB_RE
  89 #  include "re_comp.h"
  90 #else
  91 #  include "regcomp.h"
  92 #endif
  93
  94 #include "inline_invlist.c"
  95 #include "unicode_constants.h"
  96
  97 #define RF_tainted      1       /* tainted information used? e.g. locale */
  98 #define RF_warned       2               /* warned about big count? */
  99
 100 #define RF_utf8         8               /* Pattern contains multibyte chars? */
 101
 102 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
 103
 104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 105
 106 #ifndef STATIC
 107 #define STATIC  static
 108 #endif
 109
 110 /* Valid for non-utf8 strings: avoids the reginclass
 111  * call if there are no complications: i.e., if everything matchable is
 112  * straight forward in the bitmap */
 113 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 114                                               : ANYOF_BITMAP_TEST(p,*(c)))
 115
 116 /*
 117  * Forwards.
 118  */
 119
 120 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 121 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 122
 123 #define HOPc(pos,off) \
 124         (char *)(PL_reg_match_utf8 \
 125             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 126             : (U8*)(pos + off))
 127 #define HOPBACKc(pos, off) \
 128         (char*)(PL_reg_match_utf8\
 129             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 130             : (pos - off >= PL_bostr)           \
 131                 ? (U8*)pos - off                \
 132                 : NULL)
 133
 134 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 135 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 136
 137
 138 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 139 #define NEXTCHR_IS_EOS (nextchr < 0)
 140
 141 #define SET_nextchr \
 142     nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
 143
 144 #define SET_locinput(p) \
 145     locinput = (p);  \
 146     SET_nextchr
 147
 148
 149 /* these are unrolled below in the CCC_TRY_XXX defined */
 150 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 151     if (!CAT2(PL_utf8_,class)) { \
 152         bool ok; \
 153         ENTER; save_re_context(); \
 154         ok=CAT2(is_utf8_,class)((const U8*)str); \
 155         PERL_UNUSED_VAR(ok); \
 156         assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
 157 /* Doesn't do an assert to verify that is correct */
 158 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 159     if (!CAT2(PL_utf8_,class)) { \
 160         bool throw_away; \
 161         PERL_UNUSED_VAR(throw_away); \
 162         ENTER; save_re_context(); \
 163         throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
 164         PERL_UNUSED_VAR(throw_away); \
 165         LEAVE; } } STMT_END
 166
 167 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 168 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 169
 170 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 171         /* No asserts are done for some of these, in case called on a   */  \
 172         /* Unicode version in which they map to nothing */                  \
 173         LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8);                  \
 174         LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8);         \
 175
 176 #define PLACEHOLDER     /* Something for the preprocessor to grab onto */
 177
 178 /* The actual code for CCC_TRY, which uses several variables from the routine
 179  * it's callable from.  It is designed to be the bulk of a case statement.
 180  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 181  *      nextchr matches the class.
 182  * UTF8_TEST is the whole test string to use for utf8 targets
 183  * LOAD is what to use to test, and if not present to load in the swash for the
 184  *      class
 185  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 186  *      UTF8_TEST test.
 187  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 188  * utf8 and a variant, load the swash if necessary and test using the utf8
 189  * test.  Advance to the next character if test is ok, otherwise fail; If not
 190  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 191  * fails, or advance to the next character */
 192
 193 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 194     if (NEXTCHR_IS_EOS) {                                                     \
 195         sayNO;                                                                \
 196     }                                                                         \
 197     if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 198         LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 199         if (POS_OR_NEG (UTF8_TEST)) {                                         \
 200             sayNO;                                                            \
 201         }                                                                     \
 202     }                                                                         \
 203     else if (POS_OR_NEG (FUNC(nextchr))) {                                    \
 204             sayNO;                                                            \
 205     }                                                                         \
 206     goto increment_locinput;
 207
 208 /* Handle the non-locale cases for a character class and its complement.  It
 209  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 210  * This is because that code fails when the test succeeds, so we want to have
 211  * the test fail so that the code succeeds.  The swash is stored in a
 212  * predictable PL_ place */
 213 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 214                            CLASS, STR)                                        \
 215     case NAME:                                                                \
 216         _CCC_TRY_CODE( !, FUNC,                                               \
 217                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 218                                             (U8*)locinput, TRUE)),            \
 219                           CLASS, STR)                                         \
 220     case NNAME:                                                               \
 221         _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 222                           cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 223                                             (U8*)locinput, TRUE)),            \
 224                           CLASS, STR)
 225 /* Generate the case statements for both locale and non-locale character
 226  * classes in regmatch for classes that don't have special unicode semantics.
 227  * Locales don't use an immediate swash, but an intermediary special locale
 228  * function that is called on the pointer to the current place in the input
 229  * string.  That function will resolve to needing the same swash.  One might
 230  * think that because we don't know what the locale will match, we shouldn't
 231  * check with the swash loading function that it loaded properly; ie, that we
 232  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 233  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 234  * irrelevant here */
 235 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 236                 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 237                 NAMEA, NNAMEA, FUNCA,                                         \
 238                 CLASS, STR)                                                   \
 239     case NAMEL:                                                               \
 240         PL_reg_flags |= RF_tainted;                                           \
 241         _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 242     case NNAMEL:                                                              \
 243         PL_reg_flags |= RF_tainted;                                           \
 244         _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 245                        CLASS, STR)                                            \
 246     case NAMEA:                                                               \
 247         if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) {                      \
 248             sayNO;                                                            \
 249         }                                                                     \
 250         /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 251         locinput++;                                        \
 252         break;                                                                \
 253     case NNAMEA:                                                              \
 254         if (NEXTCHR_IS_EOS || FUNCA(nextchr)) {                        \
 255             sayNO;                                                            \
 256         }                                                                     \
 257         goto increment_locinput;                                              \
 258     /* Generate the non-locale cases */                                       \
 259     _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 260
 261 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 262  * statements to handle separate Unicode semantics nodes */
 263 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 264                   NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 265                   NAMEU, NNAMEU, FUNCU,                                        \
 266                   NAMEA, NNAMEA, FUNCA,                                        \
 267                   CLASS, STR)                                                  \
 268     CCC_TRY(NAME, NNAME, FUNC,                                                 \
 269             NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 270             NAMEA, NNAMEA, FUNCA,                                              \
 271             CLASS, STR)                                                        \
 272     _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 273
 274 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 275
 276 /* for use after a quantifier and before an EXACT-like node -- japhy */
 277 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 278  *
 279  * NOTE that *nothing* that affects backtracking should be in here, specifically
 280  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 281  * node that is in between two EXACT like nodes when ascertaining what the required
 282  * "follow" character is. This should probably be moved to regex compile time
 283  * although it may be done at run time beause of the REF possibility - more
 284  * investigation required. -- demerphq
 285 */
 286 #define JUMPABLE(rn) (      \
 287     OP(rn) == OPEN ||       \
 288     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 289     OP(rn) == EVAL ||   \
 290     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 291     OP(rn) == PLUS || OP(rn) == MINMOD || \
 292     OP(rn) == KEEPS || \
 293     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 294 )
 295 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 296
 297 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 298
 299 #if 0
 300 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 301    we don't need this definition. */
 302 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 303 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 304 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 305
 306 #else
 307 /* ... so we use this as its faster. */
 308 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 309 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 310 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 311 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 312
 313 #endif
 314
 315 /*
 316   Search for mandatory following text node; for lookahead, the text must
 317   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 318 */
 319 #define FIND_NEXT_IMPT(rn) STMT_START { \
 320     while (JUMPABLE(rn)) { \
 321         const OPCODE type = OP(rn); \
 322         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 323             rn = NEXTOPER(NEXTOPER(rn)); \
 324         else if (type == PLUS) \
 325             rn = NEXTOPER(rn); \
 326         else if (type == IFMATCH) \
 327             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 328         else rn += NEXT_OFF(rn); \
 329     } \
 330 } STMT_END
 331
 332 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 333  * These are for the pre-composed Hangul syllables, which are all in a
 334  * contiguous block and arranged there in such a way so as to facilitate
 335  * alorithmic determination of their characteristics.  As such, they don't need
 336  * a swash, but can be determined by simple arithmetic.  Almost all are
 337  * GCB=LVT, but every 28th one is a GCB=LV */
 338 #define SBASE 0xAC00    /* Start of block */
 339 #define SCount 11172    /* Length of block */
 340 #define TCount 28
 341
 342 static void restore_pos(pTHX_ void *arg);
 343
 344 #define REGCP_PAREN_ELEMS 3
 345 #define REGCP_OTHER_ELEMS 3
 346 #define REGCP_FRAME_ELEMS 1
 347 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 348  * are needed for the regexp context stack bookkeeping. */
 349
 350 STATIC CHECKPOINT
 351 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 352 {
 353     dVAR;
 354     const int retval = PL_savestack_ix;
 355     const int paren_elems_to_push =
 356                 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 357     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 358     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 359     I32 p;
 360     GET_RE_DEBUG_FLAGS_DECL;
 361
 362     PERL_ARGS_ASSERT_REGCPPUSH;
 363
 364     if (paren_elems_to_push < 0)
 365         Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 366                    paren_elems_to_push);
 367
 368     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 369         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 370                    " out of range (%lu-%ld)",
 371                    total_elems,
 372                    (unsigned long)maxopenparen,
 373                    (long)parenfloor);
 374
 375     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 376
 377     DEBUG_BUFFERS_r(
 378         if ((int)maxopenparen > (int)parenfloor)
 379             PerlIO_printf(Perl_debug_log,
 380                 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 381                 PTR2UV(rex),
 382                 PTR2UV(rex->offs)
 383             );
 384     );
 385     for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 386 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 387         SSPUSHINT(rex->offs[p].end);
 388         SSPUSHINT(rex->offs[p].start);
 389         SSPUSHINT(rex->offs[p].start_tmp);
 390         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 391             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 392             (UV)p,
 393             (IV)rex->offs[p].start,
 394             (IV)rex->offs[p].start_tmp,
 395             (IV)rex->offs[p].end
 396         ));
 397     }
 398 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 399     SSPUSHINT(maxopenparen);
 400     SSPUSHINT(rex->lastparen);
 401     SSPUSHINT(rex->lastcloseparen);
 402     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 403
 404     return retval;
 405 }
 406
 407 /* These are needed since we do not localize EVAL nodes: */
 408 #define REGCP_SET(cp)                                           \
 409     DEBUG_STATE_r(                                              \
 410             PerlIO_printf(Perl_debug_log,                       \
 411                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 412                 (IV)PL_savestack_ix));                          \
 413     cp = PL_savestack_ix
 414
 415 #define REGCP_UNWIND(cp)                                        \
 416     DEBUG_STATE_r(                                              \
 417         if (cp != PL_savestack_ix)                              \
 418             PerlIO_printf(Perl_debug_log,                       \
 419                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 420                 (IV)(cp), (IV)PL_savestack_ix));                \
 421     regcpblow(cp)
 422
 423 #define UNWIND_PAREN(lp, lcp)               \
 424     for (n = rex->lastparen; n > lp; n--)   \
 425         rex->offs[n].end = -1;              \
 426     rex->lastparen = n;                     \
 427     rex->lastcloseparen = lcp;
 428
 429
 430 STATIC void
 431 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 432 {
 433     dVAR;
 434     UV i;
 435     U32 paren;
 436     GET_RE_DEBUG_FLAGS_DECL;
 437
 438     PERL_ARGS_ASSERT_REGCPPOP;
 439
 440     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 441     i = SSPOPUV;
 442     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 443     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 444     rex->lastcloseparen = SSPOPINT;
 445     rex->lastparen = SSPOPINT;
 446     *maxopenparen_p = SSPOPINT;
 447
 448     i -= REGCP_OTHER_ELEMS;
 449     /* Now restore the parentheses context. */
 450     DEBUG_BUFFERS_r(
 451         if (i || rex->lastparen + 1 <= rex->nparens)
 452             PerlIO_printf(Perl_debug_log,
 453                 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 454                 PTR2UV(rex),
 455                 PTR2UV(rex->offs)
 456             );
 457     );
 458     paren = *maxopenparen_p;
 459     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 460         I32 tmps;
 461         rex->offs[paren].start_tmp = SSPOPINT;
 462         rex->offs[paren].start = SSPOPINT;
 463         tmps = SSPOPINT;
 464         if (paren <= rex->lastparen)
 465             rex->offs[paren].end = tmps;
 466         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 467             "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 468             (UV)paren,
 469             (IV)rex->offs[paren].start,
 470             (IV)rex->offs[paren].start_tmp,
 471             (IV)rex->offs[paren].end,
 472             (paren > rex->lastparen ? "(skipped)" : ""));
 473         );
 474         paren--;
 475     }
 476 #if 1
 477     /* It would seem that the similar code in regtry()
 478      * already takes care of this, and in fact it is in
 479      * a better location to since this code can #if 0-ed out
 480      * but the code in regtry() is needed or otherwise tests
 481      * requiring null fields (pat.t#187 and split.t#{13,14}
 482      * (as of patchlevel 7877)  will fail.  Then again,
 483      * this code seems to be necessary or otherwise
 484      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 485      * --jhi updated by dapm */
 486     for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 487         if (i > *maxopenparen_p)
 488             rex->offs[i].start = -1;
 489         rex->offs[i].end = -1;
 490         DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 491             "    \\%"UVuf": %s   ..-1 undeffing\n",
 492             (UV)i,
 493             (i > *maxopenparen_p) ? "-1" : "  "
 494         ));
 495     }
 496 #endif
 497 }
 498
 499 /* restore the parens and associated vars at savestack position ix,
 500  * but without popping the stack */
 501
 502 STATIC void
 503 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 504 {
 505     I32 tmpix = PL_savestack_ix;
 506     PL_savestack_ix = ix;
 507     regcppop(rex, maxopenparen_p);
 508     PL_savestack_ix = tmpix;
 509 }
 510
 511 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 512
 513 STATIC bool
 514 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 515 {
 516     /* Returns a boolean as to whether or not 'character' is a member of the
 517      * Posix character class given by 'classnum' that should be equivalent to a
 518      * value in the typedef '_char_class_number'.
 519      *
 520      * Ideally this could be replaced by a just an array of function pointers
 521      * to the C library functions that implement the macros this calls.
 522      * However, to compile, the precise function signatures are required, and
 523      * these may vary from platform to to platform.  To avoid having to figure
 524      * out what those all are on each platform, I (khw) am using this method,
 525      * which adds an extra layer of function call overhead.  But we don't
 526      * particularly care about performance with locales anyway. */
 527
 528     switch ((_char_class_number) classnum) {
 529         case _CC_ENUM_ALNUMC:    return isALNUMC_LC(character);
 530         case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 531         case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 532         case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 533         case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 534         case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 535         case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 536         case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 537         case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 538         case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 539         case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 540         case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 541         case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 542         case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 543         case _CC_ENUM_ASCII:     return isASCII_LC(character);
 544         default:    /* VERTSPACE should never occur in locales */
 545             Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 546     }
 547
 548     assert(0); /* NOTREACHED */
 549     return FALSE;
 550 }
 551
 552 /*
 553  * pregexec and friends
 554  */
 555
 556 #ifndef PERL_IN_XSUB_RE
 557 /*
 558  - pregexec - match a regexp against a string
 559  */
 560 I32
 561 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 562          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 563 /* stringarg: the point in the string at which to begin matching */
 564 /* strend:    pointer to null at end of string */
 565 /* strbeg:    real beginning of string */
 566 /* minend:    end of match must be >= minend bytes after stringarg. */
 567 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 568  *            itself is accessed via the pointers above */
 569 /* nosave:    For optimizations. */
 570 {
 571     PERL_ARGS_ASSERT_PREGEXEC;
 572
 573     return
 574         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 575                       nosave ? 0 : REXEC_COPY_STR);
 576 }
 577 #endif
 578
 579 /*
 580  * Need to implement the following flags for reg_anch:
 581  *
 582  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 583  * USE_INTUIT_ML
 584  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 585  * INTUIT_AUTORITATIVE_ML
 586  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 587  * INTUIT_ONCE_ML
 588  *
 589  * Another flag for this function: SECOND_TIME (so that float substrs
 590  * with giant delta may be not rechecked).
 591  */
 592
 593 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 594
 595 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 596    Otherwise, only SvCUR(sv) is used to get strbeg. */
 597
 598 /* XXXX We assume that strpos is strbeg unless sv. */
 599
 600 /* XXXX Some places assume that there is a fixed substring.
 601         An update may be needed if optimizer marks as "INTUITable"
 602         RExen without fixed substrings.  Similarly, it is assumed that
 603         lengths of all the strings are no more than minlen, thus they
 604         cannot come from lookahead.
 605         (Or minlen should take into account lookahead.)
 606   NOTE: Some of this comment is not correct. minlen does now take account
 607   of lookahead/behind. Further research is required. -- demerphq
 608
 609 */
 610
 611 /* A failure to find a constant substring means that there is no need to make
 612    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 613    finding a substring too deep into the string means that less calls to
 614    regtry() should be needed.
 615
 616    REx compiler's optimizer found 4 possible hints:
 617         a) Anchored substring;
 618         b) Fixed substring;
 619         c) Whether we are anchored (beginning-of-line or \G);
 620         d) First node (of those at offset 0) which may distinguish positions;
 621    We use a)b)d) and multiline-part of c), and try to find a position in the
 622    string which does not contradict any of them.
 623  */
 624
 625 /* Most of decisions we do here should have been done at compile time.
 626    The nodes of the REx which we used for the search should have been
 627    deleted from the finite automaton. */
 628
 629 char *
 630 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 631                      char *strend, const U32 flags, re_scream_pos_data *data)
 632 {
 633     dVAR;
 634     struct regexp *const prog = ReANY(rx);
 635     I32 start_shift = 0;
 636     /* Should be nonnegative! */
 637     I32 end_shift   = 0;
 638     char *s;
 639     SV *check;
 640     char *strbeg;
 641     char *t;
 642     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 643     I32 ml_anch;
 644     char *other_last = NULL;    /* other substr checked before this */
 645     char *check_at = NULL;              /* check substr found at this pos */
 646     char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 647     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 648     RXi_GET_DECL(prog,progi);
 649 #ifdef DEBUGGING
 650     const char * const i_strpos = strpos;
 651 #endif
 652     GET_RE_DEBUG_FLAGS_DECL;
 653
 654     PERL_ARGS_ASSERT_RE_INTUIT_START;
 655     PERL_UNUSED_ARG(flags);
 656     PERL_UNUSED_ARG(data);
 657
 658     RX_MATCH_UTF8_set(rx,utf8_target);
 659
 660     if (RX_UTF8(rx)) {
 661         PL_reg_flags |= RF_utf8;
 662     }
 663     DEBUG_EXECUTE_r(
 664         debug_start_match(rx, utf8_target, strpos, strend,
 665             sv ? "Guessing start of match in sv for"
 666                : "Guessing start of match in string for");
 667               );
 668
 669     /* CHR_DIST() would be more correct here but it makes things slow. */
 670     if (prog->minlen > strend - strpos) {
 671         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 672                               "String too short... [re_intuit_start]\n"));
 673         goto fail;
 674     }
 675
 676     /* XXX we need to pass strbeg as a separate arg: the following is
 677      * guesswork and can be wrong... */
 678     if (sv && SvPOK(sv)) {
 679         char * p   = SvPVX(sv);
 680         STRLEN cur = SvCUR(sv);
 681         if (p <= strpos && strpos < p + cur) {
 682             strbeg = p;
 683             assert(p <= strend && strend <= p + cur);
 684         }
 685         else
 686             strbeg = strend - cur;
 687     }
 688     else
 689         strbeg = strpos;
 690
 691     PL_regeol = strend;
 692     if (utf8_target) {
 693         if (!prog->check_utf8 && prog->check_substr)
 694             to_utf8_substr(prog);
 695         check = prog->check_utf8;
 696     } else {
 697         if (!prog->check_substr && prog->check_utf8) {
 698             if (! to_byte_substr(prog)) {
 699                 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 700             }
 701         }
 702         check = prog->check_substr;
 703     }
 704     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 705         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 706                      || ( (prog->extflags & RXf_ANCH_BOL)
 707                           && !multiline ) );    /* Check after \n? */
 708
 709         if (!ml_anch) {
 710           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 711                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 712                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 713                && sv && !SvROK(sv)
 714                && (strpos != strbeg)) {
 715               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 716               goto fail;
 717           }
 718           if (prog->check_offset_min == prog->check_offset_max
 719               && !(prog->extflags & RXf_CANY_SEEN)
 720               && ! multiline)   /* /m can cause \n's to match that aren't
 721                                    accounted for in the string max length.
 722                                    See [perl #115242] */
 723           {
 724             /* Substring at constant offset from beg-of-str... */
 725             I32 slen;
 726
 727             s = HOP3c(strpos, prog->check_offset_min, strend);
 728
 729             if (SvTAIL(check)) {
 730                 slen = SvCUR(check);    /* >= 1 */
 731
 732                 if ( strend - s > slen || strend - s < slen - 1
 733                      || (strend - s == slen && strend[-1] != '\n')) {
 734                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 735                     goto fail_finish;
 736                 }
 737                 /* Now should match s[0..slen-2] */
 738                 slen--;
 739                 if (slen && (*SvPVX_const(check) != *s
 740                              || (slen > 1
 741                                  && memNE(SvPVX_const(check), s, slen)))) {
 742                   report_neq:
 743                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 744                     goto fail_finish;
 745                 }
 746             }
 747             else if (*SvPVX_const(check) != *s
 748                      || ((slen = SvCUR(check)) > 1
 749                          && memNE(SvPVX_const(check), s, slen)))
 750                 goto report_neq;
 751             check_at = s;
 752             goto success_at_start;
 753           }
 754         }
 755         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 756         s = strpos;
 757         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 758         end_shift = prog->check_end_shift;
 759
 760         if (!ml_anch) {
 761             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 762                                          - (SvTAIL(check) != 0);
 763             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 764
 765             if (end_shift < eshift)
 766                 end_shift = eshift;
 767         }
 768     }
 769     else {                              /* Can match at random position */
 770         ml_anch = 0;
 771         s = strpos;
 772         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 773         end_shift = prog->check_end_shift;
 774
 775         /* end shift should be non negative here */
 776     }
 777
 778 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 779     if (end_shift < 0)
 780         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 781                    (IV)end_shift, RX_PRECOMP(prog));
 782 #endif
 783
 784   restart:
 785     /* Find a possible match in the region s..strend by looking for
 786        the "check" substring in the region corrected by start/end_shift. */
 787
 788     {
 789         I32 srch_start_shift = start_shift;
 790         I32 srch_end_shift = end_shift;
 791         U8* start_point;
 792         U8* end_point;
 793         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 794             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 795             srch_start_shift = strbeg - s;
 796         }
 797     DEBUG_OPTIMISE_MORE_r({
 798         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 799             (IV)prog->check_offset_min,
 800             (IV)srch_start_shift,
 801             (IV)srch_end_shift,
 802             (IV)prog->check_end_shift);
 803     });
 804
 805         if (prog->extflags & RXf_CANY_SEEN) {
 806             start_point= (U8*)(s + srch_start_shift);
 807             end_point= (U8*)(strend - srch_end_shift);
 808         } else {
 809             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 810             end_point= HOP3(strend, -srch_end_shift, strbeg);
 811         }
 812         DEBUG_OPTIMISE_MORE_r({
 813             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 814                 (int)(end_point - start_point),
 815                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 816                 start_point);
 817         });
 818
 819         s = fbm_instr( start_point, end_point,
 820                       check, multiline ? FBMrf_MULTILINE : 0);
 821     }
 822     /* Update the count-of-usability, remove useless subpatterns,
 823         unshift s.  */
 824
 825     DEBUG_EXECUTE_r({
 826         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 827             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 828         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 829                           (s ? "Found" : "Did not find"),
 830             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 831                 ? "anchored" : "floating"),
 832             quoted,
 833             RE_SV_TAIL(check),
 834             (s ? " at offset " : "...\n") );
 835     });
 836
 837     if (!s)
 838         goto fail_finish;
 839     /* Finish the diagnostic message */
 840     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 841
 842     /* XXX dmq: first branch is for positive lookbehind...
 843        Our check string is offset from the beginning of the pattern.
 844        So we need to do any stclass tests offset forward from that
 845        point. I think. :-(
 846      */
 847
 848
 849
 850     check_at=s;
 851
 852
 853     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 854        Start with the other substr.
 855        XXXX no SCREAM optimization yet - and a very coarse implementation
 856        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 857                 *always* match.  Probably should be marked during compile...
 858        Probably it is right to do no SCREAM here...
 859      */
 860
 861     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 862                 : (prog->float_substr && prog->anchored_substr))
 863     {
 864         /* Take into account the "other" substring. */
 865         /* XXXX May be hopelessly wrong for UTF... */
 866         if (!other_last)
 867             other_last = strpos;
 868         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 869           do_other_anchored:
 870             {
 871                 char * const last = HOP3c(s, -start_shift, strbeg);
 872                 char *last1, *last2;
 873                 char * const saved_s = s;
 874                 SV* must;
 875
 876                 t = s - prog->check_offset_max;
 877                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 878                     && (!utf8_target
 879                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 880                             && t > strpos)))
 881                     NOOP;
 882                 else
 883                     t = strpos;
 884                 t = HOP3c(t, prog->anchored_offset, strend);
 885                 if (t < other_last)     /* These positions already checked */
 886                     t = other_last;
 887                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 888                 if (last < last1)
 889                     last1 = last;
 890                 /* XXXX It is not documented what units *_offsets are in.
 891                    We assume bytes, but this is clearly wrong.
 892                    Meaning this code needs to be carefully reviewed for errors.
 893                    dmq.
 894                   */
 895
 896                 /* On end-of-str: see comment below. */
 897                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 898                 if (must == &PL_sv_undef) {
 899                     s = (char*)NULL;
 900                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 901                 }
 902                 else
 903                     s = fbm_instr(
 904                         (unsigned char*)t,
 905                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 906                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 907                         must,
 908                         multiline ? FBMrf_MULTILINE : 0
 909                     );
 910                 DEBUG_EXECUTE_r({
 911                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 912                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 913                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 914                         (s ? "Found" : "Contradicts"),
 915                         quoted, RE_SV_TAIL(must));
 916                 });
 917
 918
 919                 if (!s) {
 920                     if (last1 >= last2) {
 921                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 922                                                 ", giving up...\n"));
 923                         goto fail_finish;
 924                     }
 925                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 926                         ", trying floating at offset %ld...\n",
 927                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 928                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 929                     s = HOP3c(last, 1, strend);
 930                     goto restart;
 931                 }
 932                 else {
 933                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 934                           (long)(s - i_strpos)));
 935                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 936                     other_last = HOP3c(s, 1, strend);
 937                     s = saved_s;
 938                     if (t == strpos)
 939                         goto try_at_start;
 940                     goto try_at_offset;
 941                 }
 942             }
 943         }
 944         else {          /* Take into account the floating substring. */
 945             char *last, *last1;
 946             char * const saved_s = s;
 947             SV* must;
 948
 949             t = HOP3c(s, -start_shift, strbeg);
 950             last1 = last =
 951                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 952             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 953                 last = HOP3c(t, prog->float_max_offset, strend);
 954             s = HOP3c(t, prog->float_min_offset, strend);
 955             if (s < other_last)
 956                 s = other_last;
 957  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 958             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 959             /* fbm_instr() takes into account exact value of end-of-str
 960                if the check is SvTAIL(ed).  Since false positives are OK,
 961                and end-of-str is not later than strend we are OK. */
 962             if (must == &PL_sv_undef) {
 963                 s = (char*)NULL;
 964                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 965             }
 966             else
 967                 s = fbm_instr((unsigned char*)s,
 968                               (unsigned char*)last + SvCUR(must)
 969                                   - (SvTAIL(must)!=0),
 970                               must, multiline ? FBMrf_MULTILINE : 0);
 971             DEBUG_EXECUTE_r({
 972                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 973                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 974                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 975                     (s ? "Found" : "Contradicts"),
 976                     quoted, RE_SV_TAIL(must));
 977             });
 978             if (!s) {
 979                 if (last1 == last) {
 980                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 981                                             ", giving up...\n"));
 982                     goto fail_finish;
 983                 }
 984                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 985                     ", trying anchored starting at offset %ld...\n",
 986                     (long)(saved_s + 1 - i_strpos)));
 987                 other_last = last;
 988                 s = HOP3c(t, 1, strend);
 989                 goto restart;
 990             }
 991             else {
 992                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 993                       (long)(s - i_strpos)));
 994                 other_last = s; /* Fix this later. --Hugo */
 995                 s = saved_s;
 996                 if (t == strpos)
 997                     goto try_at_start;
 998                 goto try_at_offset;
 999             }
1000         }
1001     }
1002
1003
1004     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1005
1006     DEBUG_OPTIMISE_MORE_r(
1007         PerlIO_printf(Perl_debug_log,
1008             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
1009             (IV)prog->check_offset_min,
1010             (IV)prog->check_offset_max,
1011             (IV)(s-strpos),
1012             (IV)(t-strpos),
1013             (IV)(t-s),
1014             (IV)(strend-strpos)
1015         )
1016     );
1017
1018     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
1019         && (!utf8_target
1020             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1021                  && t > strpos)))
1022     {
1023         /* Fixed substring is found far enough so that the match
1024            cannot start at strpos. */
1025       try_at_offset:
1026         if (ml_anch && t[-1] != '\n') {
1027             /* Eventually fbm_*() should handle this, but often
1028                anchored_offset is not 0, so this check will not be wasted. */
1029             /* XXXX In the code below we prefer to look for "^" even in
1030                presence of anchored substrings.  And we search even
1031                beyond the found float position.  These pessimizations
1032                are historical artefacts only.  */
1033           find_anchor:
1034             while (t < strend - prog->minlen) {
1035                 if (*t == '\n') {
1036                     if (t < check_at - prog->check_offset_min) {
1037                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1038                             /* Since we moved from the found position,
1039                                we definitely contradict the found anchored
1040                                substr.  Due to the above check we do not
1041                                contradict "check" substr.
1042                                Thus we can arrive here only if check substr
1043                                is float.  Redo checking for "other"=="fixed".
1044                              */
1045                             strpos = t + 1;
1046                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1047                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1048                             goto do_other_anchored;
1049                         }
1050                         /* We don't contradict the found floating substring. */
1051                         /* XXXX Why not check for STCLASS? */
1052                         s = t + 1;
1053                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1054                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1055                         goto set_useful;
1056                     }
1057                     /* Position contradicts check-string */
1058                     /* XXXX probably better to look for check-string
1059                        than for "\n", so one should lower the limit for t? */
1060                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1061                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1062                     other_last = strpos = s = t + 1;
1063                     goto restart;
1064                 }
1065                 t++;
1066             }
1067             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1068                         PL_colors[0], PL_colors[1]));
1069             goto fail_finish;
1070         }
1071         else {
1072             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1073                         PL_colors[0], PL_colors[1]));
1074         }
1075         s = t;
1076       set_useful:
1077         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
1078     }
1079     else {
1080         /* The found string does not prohibit matching at strpos,
1081            - no optimization of calling REx engine can be performed,
1082            unless it was an MBOL and we are not after MBOL,
1083            or a future STCLASS check will fail this. */
1084       try_at_start:
1085         /* Even in this situation we may use MBOL flag if strpos is offset
1086            wrt the start of the string. */
1087         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1088             && (strpos != strbeg) && strpos[-1] != '\n'
1089             /* May be due to an implicit anchor of m{.*foo}  */
1090             && !(prog->intflags & PREGf_IMPLICIT))
1091         {
1092             t = strpos;
1093             goto find_anchor;
1094         }
1095         DEBUG_EXECUTE_r( if (ml_anch)
1096             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1097                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1098         );
1099       success_at_start:
1100         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1101             && (utf8_target ? (
1102                 prog->check_utf8                /* Could be deleted already */
1103                 && --BmUSEFUL(prog->check_utf8) < 0
1104                 && (prog->check_utf8 == prog->float_utf8)
1105             ) : (
1106                 prog->check_substr              /* Could be deleted already */
1107                 && --BmUSEFUL(prog->check_substr) < 0
1108                 && (prog->check_substr == prog->float_substr)
1109             )))
1110         {
1111             /* If flags & SOMETHING - do not do it many times on the same match */
1112             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1113             /* XXX Does the destruction order has to change with utf8_target? */
1114             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1115             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1116             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1117             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1118             check = NULL;                       /* abort */
1119             s = strpos;
1120             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1121                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1122             if (prog->intflags & PREGf_IMPLICIT)
1123                 prog->extflags &= ~RXf_ANCH_MBOL;
1124             /* XXXX This is a remnant of the old implementation.  It
1125                     looks wasteful, since now INTUIT can use many
1126                     other heuristics. */
1127             prog->extflags &= ~RXf_USE_INTUIT;
1128             /* XXXX What other flags might need to be cleared in this branch? */
1129         }
1130         else
1131             s = strpos;
1132     }
1133
1134     /* Last resort... */
1135     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1136     /* trie stclasses are too expensive to use here, we are better off to
1137        leave it to regmatch itself */
1138     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1139         /* minlen == 0 is possible if regstclass is \b or \B,
1140            and the fixed substr is ''$.
1141            Since minlen is already taken into account, s+1 is before strend;
1142            accidentally, minlen >= 1 guaranties no false positives at s + 1
1143            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1144            regstclass does not come from lookahead...  */
1145         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1146            This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1147         const U8* const str = (U8*)STRING(progi->regstclass);
1148         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1149                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1150                     : 1);
1151         char * endpos;
1152         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1153             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1154         else if (prog->float_substr || prog->float_utf8)
1155             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1156         else
1157             endpos= strend;
1158
1159         if (checked_upto < s)
1160            checked_upto = s;
1161         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1162                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1163
1164         t = s;
1165         s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1166         if (s) {
1167             checked_upto = s;
1168         } else {
1169 #ifdef DEBUGGING
1170             const char *what = NULL;
1171 #endif
1172             if (endpos == strend) {
1173                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1174                                 "Could not match STCLASS...\n") );
1175                 goto fail;
1176             }
1177             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1178                                    "This position contradicts STCLASS...\n") );
1179             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1180                 goto fail;
1181             checked_upto = HOPBACKc(endpos, start_shift);
1182             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1183                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1184             /* Contradict one of substrings */
1185             if (prog->anchored_substr || prog->anchored_utf8) {
1186                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1187                     DEBUG_EXECUTE_r( what = "anchored" );
1188                   hop_and_restart:
1189                     s = HOP3c(t, 1, strend);
1190                     if (s + start_shift + end_shift > strend) {
1191                         /* XXXX Should be taken into account earlier? */
1192                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1193                                                "Could not match STCLASS...\n") );
1194                         goto fail;
1195                     }
1196                     if (!check)
1197                         goto giveup;
1198                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1199                                 "Looking for %s substr starting at offset %ld...\n",
1200                                  what, (long)(s + start_shift - i_strpos)) );
1201                     goto restart;
1202                 }
1203                 /* Have both, check_string is floating */
1204                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1205                     goto retry_floating_check;
1206                 /* Recheck anchored substring, but not floating... */
1207                 s = check_at;
1208                 if (!check)
1209                     goto giveup;
1210                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1211                           "Looking for anchored substr starting at offset %ld...\n",
1212                           (long)(other_last - i_strpos)) );
1213                 goto do_other_anchored;
1214             }
1215             /* Another way we could have checked stclass at the
1216                current position only: */
1217             if (ml_anch) {
1218                 s = t = t + 1;
1219                 if (!check)
1220                     goto giveup;
1221                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1222                           "Looking for /%s^%s/m starting at offset %ld...\n",
1223                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1224                 goto try_at_offset;
1225             }
1226             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1227                 goto fail;
1228             /* Check is floating substring. */
1229           retry_floating_check:
1230             t = check_at - start_shift;
1231             DEBUG_EXECUTE_r( what = "floating" );
1232             goto hop_and_restart;
1233         }
1234         if (t != s) {
1235             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1236                         "By STCLASS: moving %ld --> %ld\n",
1237                                   (long)(t - i_strpos), (long)(s - i_strpos))
1238                    );
1239         }
1240         else {
1241             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1242                                   "Does not contradict STCLASS...\n");
1243                    );
1244         }
1245     }
1246   giveup:
1247     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1248                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1249                           PL_colors[5], (long)(s - i_strpos)) );
1250     return s;
1251
1252   fail_finish:                          /* Substring not found */
1253     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1254         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1255   fail:
1256     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1257                           PL_colors[4], PL_colors[5]));
1258     return NULL;
1259 }
1260
1261 #define DECL_TRIE_TYPE(scan) \
1262     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1263                     trie_type = ((scan->flags == EXACT) \
1264                               ? (utf8_target ? trie_utf8 : trie_plain) \
1265                               : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1266
1267 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,          \
1268 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                               \
1269     STRLEN skiplen;                                                                 \
1270     switch (trie_type) {                                                            \
1271     case trie_utf8_fold:                                                            \
1272         if ( foldlen>0 ) {                                                          \
1273             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1274             foldlen -= len;                                                         \
1275             uscan += len;                                                           \
1276             len=0;                                                                  \
1277         } else {                                                                    \
1278             uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1279             len = UTF8SKIP(uc);                                                     \
1280             skiplen = UNISKIP( uvc );                                               \
1281             foldlen -= skiplen;                                                     \
1282             uscan = foldbuf + skiplen;                                              \
1283         }                                                                           \
1284         break;                                                                      \
1285     case trie_latin_utf8_fold:                                                      \
1286         if ( foldlen>0 ) {                                                          \
1287             uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1288             foldlen -= len;                                                         \
1289             uscan += len;                                                           \
1290             len=0;                                                                  \
1291         } else {                                                                    \
1292             len = 1;                                                                \
1293             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);                 \
1294             skiplen = UNISKIP( uvc );                                               \
1295             foldlen -= skiplen;                                                     \
1296             uscan = foldbuf + skiplen;                                              \
1297         }                                                                           \
1298         break;                                                                      \
1299     case trie_utf8:                                                                 \
1300         uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1301         break;                                                                      \
1302     case trie_plain:                                                                \
1303         uvc = (UV)*uc;                                                              \
1304         len = 1;                                                                    \
1305     }                                                                               \
1306     if (uvc < 256) {                                                                \
1307         charid = trie->charmap[ uvc ];                                              \
1308     }                                                                               \
1309     else {                                                                          \
1310         charid = 0;                                                                 \
1311         if (widecharmap) {                                                          \
1312             SV** const svpp = hv_fetch(widecharmap,                                 \
1313                         (char*)&uvc, sizeof(UV), 0);                                \
1314             if (svpp)                                                               \
1315                 charid = (U16)SvIV(*svpp);                                          \
1316         }                                                                           \
1317     }                                                                               \
1318 } STMT_END
1319
1320 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1321 STMT_START {                                              \
1322     while (s <= e) {                                      \
1323         if ( (CoNd)                                       \
1324              && (ln == 1 || folder(s, pat_string, ln))    \
1325              && (!reginfo || regtry(reginfo, &s)) )       \
1326             goto got_it;                                  \
1327         s++;                                              \
1328     }                                                     \
1329 } STMT_END
1330
1331 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1332 STMT_START {                                          \
1333     while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) {     \
1334         CoDe                                          \
1335         s += uskip;                                   \
1336     }                                                 \
1337 } STMT_END
1338
1339 #define REXEC_FBC_SCAN(CoDe)                          \
1340 STMT_START {                                          \
1341     while (s < strend) {                              \
1342         CoDe                                          \
1343         s++;                                          \
1344     }                                                 \
1345 } STMT_END
1346
1347 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1348 REXEC_FBC_UTF8_SCAN(                                  \
1349     if (CoNd) {                                       \
1350         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1351             goto got_it;                              \
1352         else                                          \
1353             tmp = doevery;                            \
1354     }                                                 \
1355     else                                              \
1356         tmp = 1;                                      \
1357 )
1358
1359 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1360 REXEC_FBC_SCAN(                                       \
1361     if (CoNd) {                                       \
1362         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1363             goto got_it;                              \
1364         else                                          \
1365             tmp = doevery;                            \
1366     }                                                 \
1367     else                                              \
1368         tmp = 1;                                      \
1369 )
1370
1371 #define REXEC_FBC_TRYIT               \
1372 if ((!reginfo || regtry(reginfo, &s))) \
1373     goto got_it
1374
1375 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1376     if (utf8_target) {                                             \
1377         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1378     }                                                          \
1379     else {                                                     \
1380         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1381     }
1382
1383 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1384     if (utf8_target) {                                             \
1385         UtFpReLoAd;                                            \
1386         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1387     }                                                          \
1388     else {                                                     \
1389         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1390     }
1391
1392 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1393     PL_reg_flags |= RF_tainted;                                \
1394     if (utf8_target) {                                             \
1395         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1396     }                                                          \
1397     else {                                                     \
1398         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1399     }
1400
1401 #define DUMP_EXEC_POS(li,s,doutf8) \
1402     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1403
1404
1405 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1406         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1407         tmp = TEST_NON_UTF8(tmp);                                              \
1408         REXEC_FBC_UTF8_SCAN(                                                   \
1409             if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1410                 tmp = !tmp;                                                    \
1411                 IF_SUCCESS;                                                    \
1412             }                                                                  \
1413             else {                                                             \
1414                 IF_FAIL;                                                       \
1415             }                                                                  \
1416         );                                                                     \
1417
1418 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1419         if (s == PL_bostr) {                                                   \
1420             tmp = '\n';                                                        \
1421         }                                                                      \
1422         else {                                                                 \
1423             U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1424             tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1425         }                                                                      \
1426         tmp = TeSt1_UtF8;                                                      \
1427         LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1428         REXEC_FBC_UTF8_SCAN(                                                   \
1429             if (tmp == ! (TeSt2_UtF8)) { \
1430                 tmp = !tmp;                                                    \
1431                 IF_SUCCESS;                                                    \
1432             }                                                                  \
1433             else {                                                             \
1434                 IF_FAIL;                                                       \
1435             }                                                                  \
1436         );                                                                     \
1437
1438 /* The only difference between the BOUND and NBOUND cases is that
1439  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1440  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1441  * with the other one being empty */
1442 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1443     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1444
1445 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1446     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1447
1448 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1449     FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1450
1451 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1452     FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1453
1454
1455 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1456  * be passed in completely with the variable name being tested, which isn't
1457  * such a clean interface, but this is easier to read than it was before.  We
1458  * are looking for the boundary (or non-boundary between a word and non-word
1459  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1460  * must be different.  Find the "wordness" of the character just prior to this
1461  * one, and compare it with the wordness of this one.  If they differ, we have
1462  * a boundary.  At the beginning of the string, pretend that the previous
1463  * character was a new-line */
1464 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1465     if (utf8_target) {                                                         \
1466                 UTF8_CODE \
1467     }                                                                          \
1468     else {  /* Not utf8 */                                                     \
1469         tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1470         tmp = TEST_NON_UTF8(tmp);                                              \
1471         REXEC_FBC_SCAN(                                                        \
1472             if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1473                 tmp = !tmp;                                                    \
1474                 IF_SUCCESS;                                                    \
1475             }                                                                  \
1476             else {                                                             \
1477                 IF_FAIL;                                                       \
1478             }                                                                  \
1479         );                                                                     \
1480     }                                                                          \
1481     if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1482         goto got_it;
1483
1484 /* We know what class REx starts with.  Try to find this position... */
1485 /* if reginfo is NULL, its a dryrun */
1486 /* annoyingly all the vars in this routine have different names from their counterparts
1487    in regmatch. /grrr */
1488
1489 STATIC char *
1490 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1491     const char *strend, regmatch_info *reginfo)
1492 {
1493     dVAR;
1494     const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1495     char *pat_string;   /* The pattern's exactish string */
1496     char *pat_end;          /* ptr to end char of pat_string */
1497     re_fold_t folder;   /* Function for computing non-utf8 folds */
1498     const U8 *fold_array;   /* array for folding ords < 256 */
1499     STRLEN ln;
1500     STRLEN lnc;
1501     STRLEN uskip;
1502     U8 c1;
1503     U8 c2;
1504     char *e;
1505     I32 tmp = 1;        /* Scratch variable? */
1506     const bool utf8_target = PL_reg_match_utf8;
1507     UV utf8_fold_flags = 0;
1508     RXi_GET_DECL(prog,progi);
1509
1510     PERL_ARGS_ASSERT_FIND_BYCLASS;
1511
1512     /* We know what class it must start with. */
1513     switch (OP(c)) {
1514     case ANYOF:
1515         if (utf8_target) {
1516             REXEC_FBC_UTF8_CLASS_SCAN(
1517                       reginclass(prog, c, (U8*)s, utf8_target));
1518         }
1519         else {
1520             REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1521         }
1522         break;
1523     case CANY:
1524         REXEC_FBC_SCAN(
1525             if (tmp && (!reginfo || regtry(reginfo, &s)))
1526                 goto got_it;
1527             else
1528                 tmp = doevery;
1529         );
1530         break;
1531
1532     case EXACTFA:
1533         if (UTF_PATTERN || utf8_target) {
1534             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1535             goto do_exactf_utf8;
1536         }
1537         fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1538         folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1539         goto do_exactf_non_utf8;        /* isn't dealt with by these */
1540
1541     case EXACTF:
1542         if (utf8_target) {
1543
1544             /* regcomp.c already folded this if pattern is in UTF-8 */
1545             utf8_fold_flags = 0;
1546             goto do_exactf_utf8;
1547         }
1548         fold_array = PL_fold;
1549         folder = foldEQ;
1550         goto do_exactf_non_utf8;
1551
1552     case EXACTFL:
1553         if (UTF_PATTERN || utf8_target) {
1554             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1555             goto do_exactf_utf8;
1556         }
1557         fold_array = PL_fold_locale;
1558         folder = foldEQ_locale;
1559         goto do_exactf_non_utf8;
1560
1561     case EXACTFU_SS:
1562         if (UTF_PATTERN) {
1563             utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1564         }
1565         goto do_exactf_utf8;
1566
1567     case EXACTFU_TRICKYFOLD:
1568     case EXACTFU:
1569         if (UTF_PATTERN || utf8_target) {
1570             utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1571             goto do_exactf_utf8;
1572         }
1573
1574         /* Any 'ss' in the pattern should have been replaced by regcomp,
1575          * so we don't have to worry here about this single special case
1576          * in the Latin1 range */
1577         fold_array = PL_fold_latin1;
1578         folder = foldEQ_latin1;
1579
1580         /* FALL THROUGH */
1581
1582     do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1583                            are no glitches with fold-length differences
1584                            between the target string and pattern */
1585
1586         /* The idea in the non-utf8 EXACTF* cases is to first find the
1587          * first character of the EXACTF* node and then, if necessary,
1588          * case-insensitively compare the full text of the node.  c1 is the
1589          * first character.  c2 is its fold.  This logic will not work for
1590          * Unicode semantics and the german sharp ss, which hence should
1591          * not be compiled into a node that gets here. */
1592         pat_string = STRING(c);
1593         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1594
1595         /* We know that we have to match at least 'ln' bytes (which is the
1596          * same as characters, since not utf8).  If we have to match 3
1597          * characters, and there are only 2 availabe, we know without
1598          * trying that it will fail; so don't start a match past the
1599          * required minimum number from the far end */
1600         e = HOP3c(strend, -((I32)ln), s);
1601
1602         if (!reginfo && e < s) {
1603             e = s;                      /* Due to minlen logic of intuit() */
1604         }
1605
1606         c1 = *pat_string;
1607         c2 = fold_array[c1];
1608         if (c1 == c2) { /* If char and fold are the same */
1609             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1610         }
1611         else {
1612             REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1613         }
1614         break;
1615
1616     do_exactf_utf8:
1617     {
1618         unsigned expansion;
1619
1620         /* If one of the operands is in utf8, we can't use the simpler folding
1621          * above, due to the fact that many different characters can have the
1622          * same fold, or portion of a fold, or different- length fold */
1623         pat_string = STRING(c);
1624         ln  = STR_LEN(c);       /* length to match in octets/bytes */
1625         pat_end = pat_string + ln;
1626         lnc = (UTF_PATTERN)     /* length to match in characters */
1627                 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1628                 : ln;
1629
1630         /* We have 'lnc' characters to match in the pattern, but because of
1631          * multi-character folding, each character in the target can match
1632          * up to 3 characters (Unicode guarantees it will never exceed
1633          * this) if it is utf8-encoded; and up to 2 if not (based on the
1634          * fact that the Latin 1 folds are already determined, and the
1635          * only multi-char fold in that range is the sharp-s folding to
1636          * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1637          * string character.  Adjust lnc accordingly, rounding up, so that
1638          * if we need to match at least 4+1/3 chars, that really is 5. */
1639         expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1640         lnc = (lnc + expansion - 1) / expansion;
1641
1642         /* As in the non-UTF8 case, if we have to match 3 characters, and
1643          * only 2 are left, it's guaranteed to fail, so don't start a
1644          * match that would require us to go beyond the end of the string
1645          */
1646         e = HOP3c(strend, -((I32)lnc), s);
1647
1648         if (!reginfo && e < s) {
1649             e = s;                      /* Due to minlen logic of intuit() */
1650         }
1651
1652         /* XXX Note that we could recalculate e to stop the loop earlier,
1653          * as the worst case expansion above will rarely be met, and as we
1654          * go along we would usually find that e moves further to the left.
1655          * This would happen only after we reached the point in the loop
1656          * where if there were no expansion we should fail.  Unclear if
1657          * worth the expense */
1658
1659         while (s <= e) {
1660             char *my_strend= (char *)strend;
1661             if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1662                   pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1663                 && (!reginfo || regtry(reginfo, &s)) )
1664             {
1665                 goto got_it;
1666             }
1667             s += (utf8_target) ? UTF8SKIP(s) : 1;
1668         }
1669         break;
1670     }
1671     case BOUNDL:
1672         PL_reg_flags |= RF_tainted;
1673         FBC_BOUND(isALNUM_LC,
1674                   isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1675                   isALNUM_LC_utf8((U8*)s));
1676         break;
1677     case NBOUNDL:
1678         PL_reg_flags |= RF_tainted;
1679         FBC_NBOUND(isALNUM_LC,
1680                    isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1681                    isALNUM_LC_utf8((U8*)s));
1682         break;
1683     case BOUND:
1684         FBC_BOUND(isWORDCHAR,
1685                   isALNUM_uni(tmp),
1686                   cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1687         break;
1688     case BOUNDA:
1689         FBC_BOUND_NOLOAD(isWORDCHAR_A,
1690                          isWORDCHAR_A(tmp),
1691                          isWORDCHAR_A((U8*)s));
1692         break;
1693     case NBOUND:
1694         FBC_NBOUND(isWORDCHAR,
1695                    isALNUM_uni(tmp),
1696                    cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1697         break;
1698     case NBOUNDA:
1699         FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1700                           isWORDCHAR_A(tmp),
1701                           isWORDCHAR_A((U8*)s));
1702         break;
1703     case BOUNDU:
1704         FBC_BOUND(isWORDCHAR_L1,
1705                   isALNUM_uni(tmp),
1706                   cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1707         break;
1708     case NBOUNDU:
1709         FBC_NBOUND(isWORDCHAR_L1,
1710                    isALNUM_uni(tmp),
1711                    cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1712         break;
1713     case ALNUML:
1714         REXEC_FBC_CSCAN_TAINT(
1715             isALNUM_LC_utf8((U8*)s),
1716             isALNUM_LC(*s)
1717         );
1718         break;
1719     case ALNUMU:
1720         REXEC_FBC_CSCAN_PRELOAD(
1721             LOAD_UTF8_CHARCLASS_ALNUM(),
1722             swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1723             isWORDCHAR_L1((U8) *s)
1724         );
1725         break;
1726     case ALNUM:
1727         REXEC_FBC_CSCAN_PRELOAD(
1728             LOAD_UTF8_CHARCLASS_ALNUM(),
1729             swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1730             isWORDCHAR((U8) *s)
1731         );
1732         break;
1733     case ALNUMA:
1734         /* Don't need to worry about utf8, as it can match only a single
1735          * byte invariant character */
1736         REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1737         break;
1738     case NALNUMU:
1739         REXEC_FBC_CSCAN_PRELOAD(
1740             LOAD_UTF8_CHARCLASS_ALNUM(),
1741             !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1742             ! isWORDCHAR_L1((U8) *s)
1743         );
1744         break;
1745     case NALNUM:
1746         REXEC_FBC_CSCAN_PRELOAD(
1747             LOAD_UTF8_CHARCLASS_ALNUM(),
1748             !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1749             ! isALNUM(*s)
1750         );
1751         break;
1752     case NALNUMA:
1753         REXEC_FBC_CSCAN(
1754             !isWORDCHAR_A(*s),
1755             !isWORDCHAR_A(*s)
1756         );
1757         break;
1758     case NALNUML:
1759         REXEC_FBC_CSCAN_TAINT(
1760             !isALNUM_LC_utf8((U8*)s),
1761             !isALNUM_LC(*s)
1762         );
1763         break;
1764     case SPACEU:
1765         REXEC_FBC_CSCAN(
1766             is_XPERLSPACE_utf8(s),
1767             isSPACE_L1((U8) *s)
1768         );
1769         break;
1770     case SPACE:
1771         REXEC_FBC_CSCAN(
1772             is_XPERLSPACE_utf8(s),
1773             isSPACE((U8) *s)
1774         );
1775         break;
1776     case SPACEA:
1777         /* Don't need to worry about utf8, as it can match only a single
1778          * byte invariant character */
1779         REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1780         break;
1781     case SPACEL:
1782         REXEC_FBC_CSCAN_TAINT(
1783             isSPACE_LC_utf8((U8*)s),
1784             isSPACE_LC(*s)
1785         );
1786         break;
1787     case NSPACEU:
1788         REXEC_FBC_CSCAN(
1789             ! is_XPERLSPACE_utf8(s),
1790             ! isSPACE_L1((U8) *s)
1791         );
1792         break;
1793     case NSPACE:
1794         REXEC_FBC_CSCAN(
1795             ! is_XPERLSPACE_utf8(s),
1796             ! isSPACE((U8) *s)
1797         );
1798         break;
1799     case NSPACEA:
1800         REXEC_FBC_CSCAN(
1801             !isSPACE_A(*s),
1802             !isSPACE_A(*s)
1803         );
1804         break;
1805     case NSPACEL:
1806         REXEC_FBC_CSCAN_TAINT(
1807             !isSPACE_LC_utf8((U8*)s),
1808             !isSPACE_LC(*s)
1809         );
1810         break;
1811     case DIGIT:
1812         REXEC_FBC_CSCAN_PRELOAD(
1813             LOAD_UTF8_CHARCLASS_DIGIT(),
1814             swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1815             isDIGIT(*s)
1816         );
1817         break;
1818     case DIGITA:
1819         /* Don't need to worry about utf8, as it can match only a single
1820          * byte invariant character */
1821         REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1822         break;
1823     case DIGITL:
1824         REXEC_FBC_CSCAN_TAINT(
1825             isDIGIT_LC_utf8((U8*)s),
1826             isDIGIT_LC(*s)
1827         );
1828         break;
1829     case NDIGIT:
1830         REXEC_FBC_CSCAN_PRELOAD(
1831             LOAD_UTF8_CHARCLASS_DIGIT(),
1832             !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1833             !isDIGIT(*s)
1834         );
1835         break;
1836     case NDIGITA:
1837         REXEC_FBC_CSCAN(
1838             !isDIGIT_A(*s),
1839             !isDIGIT_A(*s)
1840         );
1841         break;
1842     case NDIGITL:
1843         REXEC_FBC_CSCAN_TAINT(
1844             !isDIGIT_LC_utf8((U8*)s),
1845             !isDIGIT_LC(*s)
1846         );
1847         break;
1848     case LNBREAK:
1849         REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1850                         is_LNBREAK_latin1_safe(s, strend)
1851         );
1852         break;
1853     case VERTWS:
1854         REXEC_FBC_CSCAN(
1855             is_VERTWS_utf8_safe(s, strend),
1856             is_VERTWS_latin1_safe(s, strend)
1857         );
1858         break;
1859     case NVERTWS:
1860         REXEC_FBC_CSCAN(
1861             !is_VERTWS_utf8_safe(s, strend),
1862             !is_VERTWS_latin1_safe(s, strend)
1863         );
1864         break;
1865     case HORIZWS:
1866         REXEC_FBC_CSCAN(
1867             is_HORIZWS_utf8_safe(s, strend),
1868             is_HORIZWS_latin1_safe(s, strend)
1869         );
1870         break;
1871     case NHORIZWS:
1872         REXEC_FBC_CSCAN(
1873             !is_HORIZWS_utf8_safe(s, strend),
1874             !is_HORIZWS_latin1_safe(s, strend)
1875         );
1876         break;
1877     case POSIXA:
1878         /* Don't need to worry about utf8, as it can match only a single
1879         * byte invariant character.  The flag in this node type is the
1880         * class number to pass to _generic_isCC() to build a mask for
1881         * searching in PL_charclass[] */
1882         REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1883         break;
1884     case NPOSIXA:
1885         REXEC_FBC_CSCAN(
1886             !_generic_isCC_A(*s, FLAGS(c)),
1887             !_generic_isCC_A(*s, FLAGS(c))
1888         );
1889         break;
1890
1891     case AHOCORASICKC:
1892     case AHOCORASICK:
1893         {
1894             DECL_TRIE_TYPE(c);
1895             /* what trie are we using right now */
1896             reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1897             reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1898             HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1899
1900             const char *last_start = strend - trie->minlen;
1901 #ifdef DEBUGGING
1902             const char *real_start = s;
1903 #endif
1904             STRLEN maxlen = trie->maxlen;
1905             SV *sv_points;
1906             U8 **points; /* map of where we were in the input string
1907                             when reading a given char. For ASCII this
1908                             is unnecessary overhead as the relationship
1909                             is always 1:1, but for Unicode, especially
1910                             case folded Unicode this is not true. */
1911             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1912             U8 *bitmap=NULL;
1913
1914
1915             GET_RE_DEBUG_FLAGS_DECL;
1916
1917             /* We can't just allocate points here. We need to wrap it in
1918              * an SV so it gets freed properly if there is a croak while
1919              * running the match */
1920             ENTER;
1921             SAVETMPS;
1922             sv_points=newSV(maxlen * sizeof(U8 *));
1923             SvCUR_set(sv_points,
1924                 maxlen * sizeof(U8 *));
1925             SvPOK_on(sv_points);
1926             sv_2mortal(sv_points);
1927             points=(U8**)SvPV_nolen(sv_points );
1928             if ( trie_type != trie_utf8_fold
1929                  && (trie->bitmap || OP(c)==AHOCORASICKC) )
1930             {
1931                 if (trie->bitmap)
1932                     bitmap=(U8*)trie->bitmap;
1933                 else
1934                     bitmap=(U8*)ANYOF_BITMAP(c);
1935             }
1936             /* this is the Aho-Corasick algorithm modified a touch
1937                to include special handling for long "unknown char" sequences.
1938                The basic idea being that we use AC as long as we are dealing
1939                with a possible matching char, when we encounter an unknown char
1940                (and we have not encountered an accepting state) we scan forward
1941                until we find a legal starting char.
1942                AC matching is basically that of trie matching, except that when
1943                we encounter a failing transition, we fall back to the current
1944                states "fail state", and try the current char again, a process
1945                we repeat until we reach the root state, state 1, or a legal
1946                transition. If we fail on the root state then we can either
1947                terminate if we have reached an accepting state previously, or
1948                restart the entire process from the beginning if we have not.
1949
1950              */
1951             while (s <= last_start) {
1952                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1953                 U8 *uc = (U8*)s;
1954                 U16 charid = 0;
1955                 U32 base = 1;
1956                 U32 state = 1;
1957                 UV uvc = 0;
1958                 STRLEN len = 0;
1959                 STRLEN foldlen = 0;
1960                 U8 *uscan = (U8*)NULL;
1961                 U8 *leftmost = NULL;
1962 #ifdef DEBUGGING
1963                 U32 accepted_word= 0;
1964 #endif
1965                 U32 pointpos = 0;
1966
1967                 while ( state && uc <= (U8*)strend ) {
1968                     int failed=0;
1969                     U32 word = aho->states[ state ].wordnum;
1970
1971                     if( state==1 ) {
1972                         if ( bitmap ) {
1973                             DEBUG_TRIE_EXECUTE_r(
1974                                 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1975                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1976                                         (char *)uc, utf8_target );
1977                                     PerlIO_printf( Perl_debug_log,
1978                                         " Scanning for legal start char...\n");
1979                                 }
1980                             );
1981                             if (utf8_target) {
1982                                 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1983                                     uc += UTF8SKIP(uc);
1984                                 }
1985                             } else {
1986                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1987                                     uc++;
1988                                 }
1989                             }
1990                             s= (char *)uc;
1991                         }
1992                         if (uc >(U8*)last_start) break;
1993                     }
1994
1995                     if ( word ) {
1996                         U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1997                         if (!leftmost || lpos < leftmost) {
1998                             DEBUG_r(accepted_word=word);
1999                             leftmost= lpos;
2000                         }
2001                         if (base==0) break;
2002
2003                     }
2004                     points[pointpos++ % maxlen]= uc;
2005                     if (foldlen || uc < (U8*)strend) {
2006                         REXEC_TRIE_READ_CHAR(trie_type, trie,
2007                                          widecharmap, uc,
2008                                          uscan, len, uvc, charid, foldlen,
2009                                          foldbuf, uniflags);
2010                         DEBUG_TRIE_EXECUTE_r({
2011                             dump_exec_pos( (char *)uc, c, strend,
2012                                         real_start, s, utf8_target);
2013                             PerlIO_printf(Perl_debug_log,
2014                                 " Charid:%3u CP:%4"UVxf" ",
2015                                  charid, uvc);
2016                         });
2017                     }
2018                     else {
2019                         len = 0;
2020                         charid = 0;
2021                     }
2022
2023
2024                     do {
2025 #ifdef DEBUGGING
2026                         word = aho->states[ state ].wordnum;
2027 #endif
2028                         base = aho->states[ state ].trans.base;
2029
2030                         DEBUG_TRIE_EXECUTE_r({
2031                             if (failed)
2032                                 dump_exec_pos( (char *)uc, c, strend, real_start,
2033                                     s,   utf8_target );
2034                             PerlIO_printf( Perl_debug_log,
2035                                 "%sState: %4"UVxf", word=%"UVxf,
2036                                 failed ? " Fail transition to " : "",
2037                                 (UV)state, (UV)word);
2038                         });
2039                         if ( base ) {
2040                             U32 tmp;
2041                             I32 offset;
2042                             if (charid &&
2043                                  ( ((offset = base + charid
2044                                     - 1 - trie->uniquecharcount)) >= 0)
2045                                  && ((U32)offset < trie->lasttrans)
2046                                  && trie->trans[offset].check == state
2047                                  && (tmp=trie->trans[offset].next))
2048                             {
2049                                 DEBUG_TRIE_EXECUTE_r(
2050                                     PerlIO_printf( Perl_debug_log," - legal\n"));
2051                                 state = tmp;
2052                                 break;
2053                             }
2054                             else {
2055                                 DEBUG_TRIE_EXECUTE_r(
2056                                     PerlIO_printf( Perl_debug_log," - fail\n"));
2057                                 failed = 1;
2058                                 state = aho->fail[state];
2059                             }
2060                         }
2061                         else {
2062                             /* we must be accepting here */
2063                             DEBUG_TRIE_EXECUTE_r(
2064                                     PerlIO_printf( Perl_debug_log," - accepting\n"));
2065                             failed = 1;
2066                             break;
2067                         }
2068                     } while(state);
2069                     uc += len;
2070                     if (failed) {
2071                         if (leftmost)
2072                             break;
2073                         if (!state) state = 1;
2074                     }
2075                 }
2076                 if ( aho->states[ state ].wordnum ) {
2077                     U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2078                     if (!leftmost || lpos < leftmost) {
2079                         DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2080                         leftmost = lpos;
2081                     }
2082                 }
2083                 if (leftmost) {
2084                     s = (char*)leftmost;
2085                     DEBUG_TRIE_EXECUTE_r({
2086                         PerlIO_printf(
2087                             Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2088                             (UV)accepted_word, (IV)(s - real_start)
2089                         );
2090                     });
2091                     if (!reginfo || regtry(reginfo, &s)) {
2092                         FREETMPS;
2093                         LEAVE;
2094                         goto got_it;
2095                     }
2096                     s = HOPc(s,1);
2097                     DEBUG_TRIE_EXECUTE_r({
2098                         PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2099                     });
2100                 } else {
2101                     DEBUG_TRIE_EXECUTE_r(
2102                         PerlIO_printf( Perl_debug_log,"No match.\n"));
2103                     break;
2104                 }
2105             }
2106             FREETMPS;
2107             LEAVE;
2108         }
2109         break;
2110     default:
2111         Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2112         break;
2113     }
2114     return 0;
2115   got_it:
2116     return s;
2117 }
2118
2119
2120 /*
2121  - regexec_flags - match a regexp against a string
2122  */
2123 I32
2124 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2125               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2126 /* stringarg: the point in the string at which to begin matching */
2127 /* strend:    pointer to null at end of string */
2128 /* strbeg:    real beginning of string */
2129 /* minend:    end of match must be >= minend bytes after stringarg. */
2130 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2131  *            itself is accessed via the pointers above */
2132 /* data:      May be used for some additional optimizations.
2133               Currently its only used, with a U32 cast, for transmitting
2134               the ganch offset when doing a /g match. This will change */
2135 /* nosave:    For optimizations. */
2136
2137 {
2138     dVAR;
2139     struct regexp *const prog = ReANY(rx);
2140     char *s;
2141     regnode *c;
2142     char *startpos = stringarg;
2143     I32 minlen;         /* must match at least this many chars */
2144     I32 dontbother = 0; /* how many characters not to try at end */
2145     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
2146     I32 scream_pos = -1;                /* Internal iterator of scream. */
2147     char *scream_olds = NULL;
2148     const bool utf8_target = cBOOL(DO_UTF8(sv));
2149     I32 multiline;
2150     RXi_GET_DECL(prog,progi);
2151     regmatch_info reginfo;  /* create some info to pass to regtry etc */
2152     regexp_paren_pair *swap = NULL;
2153     GET_RE_DEBUG_FLAGS_DECL;
2154
2155     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2156     PERL_UNUSED_ARG(data);
2157
2158     /* Be paranoid... */
2159     if (prog == NULL || startpos == NULL) {
2160         Perl_croak(aTHX_ "NULL regexp parameter");
2161         return 0;
2162     }
2163
2164     multiline = prog->extflags & RXf_PMf_MULTILINE;
2165     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
2166
2167     RX_MATCH_UTF8_set(rx, utf8_target);
2168     DEBUG_EXECUTE_r(
2169         debug_start_match(rx, utf8_target, startpos, strend,
2170         "Matching");
2171     );
2172
2173     minlen = prog->minlen;
2174
2175     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2176         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2177                               "String too short [regexec_flags]...\n"));
2178         goto phooey;
2179     }
2180
2181
2182     /* Check validity of program. */
2183     if (UCHARAT(progi->program) != REG_MAGIC) {
2184         Perl_croak(aTHX_ "corrupted regexp program");
2185     }
2186
2187     PL_reg_flags = 0;
2188     PL_reg_state.re_state_eval_setup_done = FALSE;
2189     PL_reg_maxiter = 0;
2190
2191     if (RX_UTF8(rx))
2192         PL_reg_flags |= RF_utf8;
2193
2194     /* Mark beginning of line for ^ and lookbehind. */
2195     reginfo.bol = startpos; /* XXX not used ??? */
2196     PL_bostr  = strbeg;
2197     reginfo.sv = sv;
2198
2199     /* Mark end of line for $ (and such) */
2200     PL_regeol = strend;
2201
2202     /* see how far we have to get to not match where we matched before */
2203     reginfo.till = startpos+minend;
2204
2205     /* If there is a "must appear" string, look for it. */
2206     s = startpos;
2207
2208     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2209         MAGIC *mg;
2210         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2211             reginfo.ganch = startpos + prog->gofs;
2212             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2213               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2214         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2215                   && SvMAGIC(sv)
2216                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2217                   && mg->mg_len >= 0) {
2218             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2219             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2220                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2221
2222             if (prog->extflags & RXf_ANCH_GPOS) {
2223                 if (s > reginfo.ganch)
2224                     goto phooey;
2225                 s = reginfo.ganch - prog->gofs;
2226                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2227                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2228                 if (s < strbeg)
2229                     goto phooey;
2230             }
2231         }
2232         else if (data) {
2233             reginfo.ganch = strbeg + PTR2UV(data);
2234             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2235                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2236
2237         } else {                                /* pos() not defined */
2238             reginfo.ganch = strbeg;
2239             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2240                  "GPOS: reginfo.ganch = strbeg\n"));
2241         }
2242     }
2243     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2244         /* We have to be careful. If the previous successful match
2245            was from this regex we don't want a subsequent partially
2246            successful match to clobber the old results.
2247            So when we detect this possibility we add a swap buffer
2248            to the re, and switch the buffer each match. If we fail
2249            we switch it back, otherwise we leave it swapped.
2250         */
2251         swap = prog->offs;
2252         /* do we need a save destructor here for eval dies? */
2253         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2254         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2255             "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2256             PTR2UV(prog),
2257             PTR2UV(swap),
2258             PTR2UV(prog->offs)
2259         ));
2260     }
2261     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2262         re_scream_pos_data d;
2263
2264         d.scream_olds = &scream_olds;
2265         d.scream_pos = &scream_pos;
2266         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2267         if (!s) {
2268             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2269             goto phooey;        /* not present */
2270         }
2271     }
2272
2273
2274
2275     /* Simplest case:  anchored match need be tried only once. */
2276     /*  [unless only anchor is BOL and multiline is set] */
2277     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2278         if (s == startpos && regtry(&reginfo, &startpos))
2279             goto got_it;
2280         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2281                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2282         {
2283             char *end;
2284
2285             if (minlen)
2286                 dontbother = minlen - 1;
2287             end = HOP3c(strend, -dontbother, strbeg) - 1;
2288             /* for multiline we only have to try after newlines */
2289             if (prog->check_substr || prog->check_utf8) {
2290                 /* because of the goto we can not easily reuse the macros for bifurcating the
2291                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2292                 if (utf8_target) {
2293                     if (s == startpos)
2294                         goto after_try_utf8;
2295                     while (1) {
2296                         if (regtry(&reginfo, &s)) {
2297                             goto got_it;
2298                         }
2299                       after_try_utf8:
2300                         if (s > end) {
2301                             goto phooey;
2302                         }
2303                         if (prog->extflags & RXf_USE_INTUIT) {
2304                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2305                             if (!s) {
2306                                 goto phooey;
2307                             }
2308                         }
2309                         else {
2310                             s += UTF8SKIP(s);
2311                         }
2312                     }
2313                 } /* end search for check string in unicode */
2314                 else {
2315                     if (s == startpos) {
2316                         goto after_try_latin;
2317                     }
2318                     while (1) {
2319                         if (regtry(&reginfo, &s)) {
2320                             goto got_it;
2321                         }
2322                       after_try_latin:
2323                         if (s > end) {
2324                             goto phooey;
2325                         }
2326                         if (prog->extflags & RXf_USE_INTUIT) {
2327                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2328                             if (!s) {
2329                                 goto phooey;
2330                             }
2331                         }
2332                         else {
2333                             s++;
2334                         }
2335                     }
2336                 } /* end search for check string in latin*/
2337             } /* end search for check string */
2338             else { /* search for newline */
2339                 if (s > startpos) {
2340                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2341                     s--;
2342                 }
2343                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2344                 while (s <= end) { /* note it could be possible to match at the end of the string */
2345                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2346                         if (regtry(&reginfo, &s))
2347                             goto got_it;
2348                     }
2349                 }
2350             } /* end search for newline */
2351         } /* end anchored/multiline check string search */
2352         goto phooey;
2353     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2354     {
2355         /* the warning about reginfo.ganch being used without initialization
2356            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2357            and we only enter this block when the same bit is set. */
2358         char *tmp_s = reginfo.ganch - prog->gofs;
2359
2360         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2361             goto got_it;
2362         goto phooey;
2363     }
2364
2365     /* Messy cases:  unanchored match. */
2366     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2367         /* we have /x+whatever/ */
2368         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2369         char ch;
2370 #ifdef DEBUGGING
2371         int did_match = 0;
2372 #endif
2373         if (utf8_target) {
2374             if (! prog->anchored_utf8) {
2375                 to_utf8_substr(prog);
2376             }
2377             ch = SvPVX_const(prog->anchored_utf8)[0];
2378             REXEC_FBC_SCAN(
2379                 if (*s == ch) {
2380                     DEBUG_EXECUTE_r( did_match = 1 );
2381                     if (regtry(&reginfo, &s)) goto got_it;
2382                     s += UTF8SKIP(s);
2383                     while (s < strend && *s == ch)
2384                         s += UTF8SKIP(s);
2385                 }
2386             );
2387
2388         }
2389         else {
2390             if (! prog->anchored_substr) {
2391                 if (! to_byte_substr(prog)) {
2392                     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2393                 }
2394             }
2395             ch = SvPVX_const(prog->anchored_substr)[0];
2396             REXEC_FBC_SCAN(
2397                 if (*s == ch) {
2398                     DEBUG_EXECUTE_r( did_match = 1 );
2399                     if (regtry(&reginfo, &s)) goto got_it;
2400                     s++;
2401                     while (s < strend && *s == ch)
2402                         s++;
2403                 }
2404             );
2405         }
2406         DEBUG_EXECUTE_r(if (!did_match)
2407                 PerlIO_printf(Perl_debug_log,
2408                                   "Did not find anchored character...\n")
2409                );
2410     }
2411     else if (prog->anchored_substr != NULL
2412               || prog->anchored_utf8 != NULL
2413               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2414                   && prog->float_max_offset < strend - s)) {
2415         SV *must;
2416         I32 back_max;
2417         I32 back_min;
2418         char *last;
2419         char *last1;            /* Last position checked before */
2420 #ifdef DEBUGGING
2421         int did_match = 0;
2422 #endif
2423         if (prog->anchored_substr || prog->anchored_utf8) {
2424             if (utf8_target) {
2425                 if (! prog->anchored_utf8) {
2426                     to_utf8_substr(prog);
2427                 }
2428                 must = prog->anchored_utf8;
2429             }
2430             else {
2431                 if (! prog->anchored_substr) {
2432                     if (! to_byte_substr(prog)) {
2433                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2434                     }
2435                 }
2436                 must = prog->anchored_substr;
2437             }
2438             back_max = back_min = prog->anchored_offset;
2439         } else {
2440             if (utf8_target) {
2441                 if (! prog->float_utf8) {
2442                     to_utf8_substr(prog);
2443                 }
2444                 must = prog->float_utf8;
2445             }
2446             else {
2447                 if (! prog->float_substr) {
2448                     if (! to_byte_substr(prog)) {
2449                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2450                     }
2451                 }
2452                 must = prog->float_substr;
2453             }
2454             back_max = prog->float_max_offset;
2455             back_min = prog->float_min_offset;
2456         }
2457
2458         if (back_min<0) {
2459             last = strend;
2460         } else {
2461             last = HOP3c(strend,        /* Cannot start after this */
2462                   -(I32)(CHR_SVLEN(must)
2463                          - (SvTAIL(must) != 0) + back_min), strbeg);
2464         }
2465         if (s > PL_bostr)
2466             last1 = HOPc(s, -1);
2467         else
2468             last1 = s - 1;      /* bogus */
2469
2470         /* XXXX check_substr already used to find "s", can optimize if
2471            check_substr==must. */
2472         scream_pos = -1;
2473         dontbother = end_shift;
2474         strend = HOPc(strend, -dontbother);
2475         while ( (s <= last) &&
2476                 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2477                                   (unsigned char*)strend, must,
2478                                   multiline ? FBMrf_MULTILINE : 0)) ) {
2479             DEBUG_EXECUTE_r( did_match = 1 );
2480             if (HOPc(s, -back_max) > last1) {
2481                 last1 = HOPc(s, -back_min);
2482                 s = HOPc(s, -back_max);
2483             }
2484             else {
2485                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2486
2487                 last1 = HOPc(s, -back_min);
2488                 s = t;
2489             }
2490             if (utf8_target) {
2491                 while (s <= last1) {
2492                     if (regtry(&reginfo, &s))
2493                         goto got_it;
2494                     if (s >= last1) {
2495                         s++; /* to break out of outer loop */
2496                         break;
2497                     }
2498                     s += UTF8SKIP(s);
2499                 }
2500             }
2501             else {
2502                 while (s <= last1) {
2503                     if (regtry(&reginfo, &s))
2504                         goto got_it;
2505                     s++;
2506                 }
2507             }
2508         }
2509         DEBUG_EXECUTE_r(if (!did_match) {
2510             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2511                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2512             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2513                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2514                                ? "anchored" : "floating"),
2515                 quoted, RE_SV_TAIL(must));
2516         });
2517         goto phooey;
2518     }
2519     else if ( (c = progi->regstclass) ) {
2520         if (minlen) {
2521             const OPCODE op = OP(progi->regstclass);
2522             /* don't bother with what can't match */
2523             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2524                 strend = HOPc(strend, -(minlen - 1));
2525         }
2526         DEBUG_EXECUTE_r({
2527             SV * const prop = sv_newmortal();
2528             regprop(prog, prop, c);
2529             {
2530                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2531                     s,strend-s,60);
2532                 PerlIO_printf(Perl_debug_log,
2533                     "Matching stclass %.*s against %s (%d bytes)\n",
2534                     (int)SvCUR(prop), SvPVX_const(prop),
2535                      quoted, (int)(strend - s));
2536             }
2537         });
2538         if (find_byclass(prog, c, s, strend, &reginfo))
2539             goto got_it;
2540         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2541     }
2542     else {
2543         dontbother = 0;
2544         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2545             /* Trim the end. */
2546             char *last= NULL;
2547             SV* float_real;
2548             STRLEN len;
2549             const char *little;
2550
2551             if (utf8_target) {
2552                 if (! prog->float_utf8) {
2553                     to_utf8_substr(prog);
2554                 }
2555                 float_real = prog->float_utf8;
2556             }
2557             else {
2558                 if (! prog->float_substr) {
2559                     if (! to_byte_substr(prog)) {
2560                         NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2561                     }
2562                 }
2563                 float_real = prog->float_substr;
2564             }
2565
2566             little = SvPV_const(float_real, len);
2567             if (SvTAIL(float_real)) {
2568                     /* This means that float_real contains an artificial \n on
2569                      * the end due to the presence of something like this:
2570                      * /foo$/ where we can match both "foo" and "foo\n" at the
2571                      * end of the string.  So we have to compare the end of the
2572                      * string first against the float_real without the \n and
2573                      * then against the full float_real with the string.  We
2574                      * have to watch out for cases where the string might be
2575                      * smaller than the float_real or the float_real without
2576                      * the \n. */
2577                     char *checkpos= strend - len;
2578                     DEBUG_OPTIMISE_r(
2579                         PerlIO_printf(Perl_debug_log,
2580                             "%sChecking for float_real.%s\n",
2581                             PL_colors[4], PL_colors[5]));
2582                     if (checkpos + 1 < strbeg) {
2583                         /* can't match, even if we remove the trailing \n
2584                          * string is too short to match */
2585                         DEBUG_EXECUTE_r(
2586                             PerlIO_printf(Perl_debug_log,
2587                                 "%sString shorter than required trailing substring, cannot match.%s\n",
2588                                 PL_colors[4], PL_colors[5]));
2589                         goto phooey;
2590                     } else if (memEQ(checkpos + 1, little, len - 1)) {
2591                         /* can match, the end of the string matches without the
2592                          * "\n" */
2593                         last = checkpos + 1;
2594                     } else if (checkpos < strbeg) {
2595                         /* cant match, string is too short when the "\n" is
2596                          * included */
2597                         DEBUG_EXECUTE_r(
2598                             PerlIO_printf(Perl_debug_log,
2599                                 "%sString does not contain required trailing substring, cannot match.%s\n",
2600                                 PL_colors[4], PL_colors[5]));
2601                         goto phooey;
2602                     } else if (!multiline) {
2603                         /* non multiline match, so compare with the "\n" at the
2604                          * end of the string */
2605                         if (memEQ(checkpos, little, len)) {
2606                             last= checkpos;
2607                         } else {
2608                             DEBUG_EXECUTE_r(
2609                                 PerlIO_printf(Perl_debug_log,
2610                                     "%sString does not contain required trailing substring, cannot match.%s\n",
2611                                     PL_colors[4], PL_colors[5]));
2612                             goto phooey;
2613                         }
2614                     } else {
2615                         /* multiline match, so we have to search for a place
2616                          * where the full string is located */
2617                         goto find_last;
2618                     }
2619             } else {
2620                   find_last:
2621                     if (len)
2622                         last = rninstr(s, strend, little, little + len);
2623                     else
2624                         last = strend;  /* matching "$" */
2625             }
2626             if (!last) {
2627                 /* at one point this block contained a comment which was
2628                  * probably incorrect, which said that this was a "should not
2629                  * happen" case.  Even if it was true when it was written I am
2630                  * pretty sure it is not anymore, so I have removed the comment
2631                  * and replaced it with this one. Yves */
2632                 DEBUG_EXECUTE_r(
2633                     PerlIO_printf(Perl_debug_log,
2634                         "String does not contain required substring, cannot match.\n"
2635                     ));
2636                 goto phooey;
2637             }
2638             dontbother = strend - last + prog->float_min_offset;
2639         }
2640         if (minlen && (dontbother < minlen))
2641             dontbother = minlen - 1;
2642         strend -= dontbother;              /* this one's always in bytes! */
2643         /* We don't know much -- general case. */
2644         if (utf8_target) {
2645             for (;;) {
2646                 if (regtry(&reginfo, &s))
2647                     goto got_it;
2648                 if (s >= strend)
2649                     break;
2650                 s += UTF8SKIP(s);
2651             };
2652         }
2653         else {
2654             do {
2655                 if (regtry(&reginfo, &s))
2656                     goto got_it;
2657             } while (s++ < strend);
2658         }
2659     }
2660
2661     /* Failure. */
2662     goto phooey;
2663
2664 got_it:
2665     DEBUG_BUFFERS_r(
2666         if (swap)
2667             PerlIO_printf(Perl_debug_log,
2668                 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2669                 PTR2UV(prog),
2670                 PTR2UV(swap)
2671             );
2672     );
2673     Safefree(swap);
2674     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2675
2676     if (PL_reg_state.re_state_eval_setup_done)
2677         restore_pos(aTHX_ prog);
2678     if (RXp_PAREN_NAMES(prog))
2679         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2680
2681     /* make sure $`, $&, $', and $digit will work later */
2682     if ( !(flags & REXEC_NOT_FIRST) ) {
2683         if (flags & REXEC_COPY_STR) {
2684 #ifdef PERL_ANY_COW
2685             if (SvCANCOW(sv)) {
2686                 if (DEBUG_C_TEST) {
2687                     PerlIO_printf(Perl_debug_log,
2688                                   "Copy on write: regexp capture, type %d\n",
2689                                   (int) SvTYPE(sv));
2690                 }
2691                 RX_MATCH_COPY_FREE(rx);
2692                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2693                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2694                 assert (SvPOKp(prog->saved_copy));
2695                 prog->sublen  = PL_regeol - strbeg;
2696                 prog->suboffset = 0;
2697                 prog->subcoffset = 0;
2698             } else
2699 #endif
2700             {
2701                 I32 min = 0;
2702                 I32 max = PL_regeol - strbeg;
2703                 I32 sublen;
2704
2705                 if (    (flags & REXEC_COPY_SKIP_POST)
2706                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2707                     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2708                 ) { /* don't copy $' part of string */
2709                     U32 n = 0;
2710                     max = -1;
2711                     /* calculate the right-most part of the string covered
2712                      * by a capture. Due to look-ahead, this may be to
2713                      * the right of $&, so we have to scan all captures */
2714                     while (n <= prog->lastparen) {
2715                         if (prog->offs[n].end > max)
2716                             max = prog->offs[n].end;
2717                         n++;
2718                     }
2719                     if (max == -1)
2720                         max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2721                                 ? prog->offs[0].start
2722                                 : 0;
2723                     assert(max >= 0 && max <= PL_regeol - strbeg);
2724                 }
2725
2726                 if (    (flags & REXEC_COPY_SKIP_PRE)
2727                     && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2728                     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2729                 ) { /* don't copy $` part of string */
2730                     U32 n = 0;
2731                     min = max;
2732                     /* calculate the left-most part of the string covered
2733                      * by a capture. Due to look-behind, this may be to
2734                      * the left of $&, so we have to scan all captures */
2735                     while (min && n <= prog->lastparen) {
2736                         if (   prog->offs[n].start != -1
2737                             && prog->offs[n].start < min)
2738                         {
2739                             min = prog->offs[n].start;
2740                         }
2741                         n++;
2742                     }
2743                     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2744                         && min >  prog->offs[0].end
2745                     )
2746                         min = prog->offs[0].end;
2747
2748                 }
2749
2750                 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2751                 sublen = max - min;
2752
2753                 if (RX_MATCH_COPIED(rx)) {
2754                     if (sublen > prog->sublen)
2755                         prog->subbeg =
2756                                 (char*)saferealloc(prog->subbeg, sublen+1);
2757                 }
2758                 else
2759                     prog->subbeg = (char*)safemalloc(sublen+1);
2760                 Copy(strbeg + min, prog->subbeg, sublen, char);
2761                 prog->subbeg[sublen] = '\0';
2762                 prog->suboffset = min;
2763                 prog->sublen = sublen;
2764                 RX_MATCH_COPIED_on(rx);
2765             }
2766             prog->subcoffset = prog->suboffset;
2767             if (prog->suboffset && utf8_target) {
2768                 /* Convert byte offset to chars.
2769                  * XXX ideally should only compute this if @-/@+
2770                  * has been seen, a la PL_sawampersand ??? */
2771
2772                 /* If there's a direct correspondence between the
2773                  * string which we're matching and the original SV,
2774                  * then we can use the utf8 len cache associated with
2775                  * the SV. In particular, it means that under //g,
2776                  * sv_pos_b2u() will use the previously cached
2777                  * position to speed up working out the new length of
2778                  * subcoffset, rather than counting from the start of
2779                  * the string each time. This stops
2780                  *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2781                  * from going quadratic */
2782                 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2783                     sv_pos_b2u(sv, &(prog->subcoffset));
2784                 else
2785                     prog->subcoffset = utf8_length((U8*)strbeg,
2786                                         (U8*)(strbeg+prog->suboffset));
2787             }
2788         }
2789         else {
2790             RX_MATCH_COPY_FREE(rx);
2791             prog->subbeg = strbeg;
2792             prog->suboffset = 0;
2793             prog->subcoffset = 0;
2794             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2795         }
2796     }
2797
2798     return 1;
2799
2800 phooey:
2801     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2802                           PL_colors[4], PL_colors[5]));
2803     if (PL_reg_state.re_state_eval_setup_done)
2804         restore_pos(aTHX_ prog);
2805     if (swap) {
2806         /* we failed :-( roll it back */
2807         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2808             "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2809             PTR2UV(prog),
2810             PTR2UV(prog->offs),
2811             PTR2UV(swap)
2812         ));
2813         Safefree(prog->offs);
2814         prog->offs = swap;
2815     }
2816     return 0;
2817 }
2818
2819
2820 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2821  * Do inc before dec, in case old and new rex are the same */
2822 #define SET_reg_curpm(Re2) \
2823     if (PL_reg_state.re_state_eval_setup_done) {    \
2824         (void)ReREFCNT_inc(Re2);                    \
2825         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));       \
2826         PM_SETRE((PL_reg_curpm), (Re2));            \
2827     }
2828
2829
2830 /*
2831  - regtry - try match at specific point
2832  */
2833 STATIC I32                      /* 0 failure, 1 success */
2834 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2835 {
2836     dVAR;
2837     CHECKPOINT lastcp;
2838     REGEXP *const rx = reginfo->prog;
2839     regexp *const prog = ReANY(rx);
2840     I32 result;
2841     RXi_GET_DECL(prog,progi);
2842     GET_RE_DEBUG_FLAGS_DECL;
2843
2844     PERL_ARGS_ASSERT_REGTRY;
2845
2846     reginfo->cutpoint=NULL;
2847
2848     if ((prog->extflags & RXf_EVAL_SEEN)
2849         && !PL_reg_state.re_state_eval_setup_done)
2850     {
2851         MAGIC *mg;
2852
2853         PL_reg_state.re_state_eval_setup_done = TRUE;
2854         if (reginfo->sv) {
2855             /* Make $_ available to executed code. */
2856             if (reginfo->sv != DEFSV) {
2857                 SAVE_DEFSV;
2858                 DEFSV_set(reginfo->sv);
2859             }
2860
2861             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2862                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2863                 /* prepare for quick setting of pos */
2864 #ifdef PERL_OLD_COPY_ON_WRITE
2865                 if (SvIsCOW(reginfo->sv))
2866                     sv_force_normal_flags(reginfo->sv, 0);
2867 #endif
2868                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2869                                  &PL_vtbl_mglob, NULL, 0);
2870                 mg->mg_len = -1;
2871             }
2872             PL_reg_magic    = mg;
2873             PL_reg_oldpos   = mg->mg_len;
2874             SAVEDESTRUCTOR_X(restore_pos, prog);
2875         }
2876         if (!PL_reg_curpm) {
2877             Newxz(PL_reg_curpm, 1, PMOP);
2878 #ifdef USE_ITHREADS
2879             {
2880                 SV* const repointer = &PL_sv_undef;
2881                 /* this regexp is also owned by the new PL_reg_curpm, which
2882                    will try to free it.  */
2883                 av_push(PL_regex_padav, repointer);
2884                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2885                 PL_regex_pad = AvARRAY(PL_regex_padav);
2886             }
2887 #endif
2888         }
2889         SET_reg_curpm(rx);
2890         PL_reg_oldcurpm = PL_curpm;
2891         PL_curpm = PL_reg_curpm;
2892         if (RXp_MATCH_COPIED(prog)) {
2893             /*  Here is a serious problem: we cannot rewrite subbeg,
2894                 since it may be needed if this match fails.  Thus
2895                 $` inside (?{}) could fail... */
2896             PL_reg_oldsaved = prog->subbeg;
2897             PL_reg_oldsavedlen = prog->sublen;
2898             PL_reg_oldsavedoffset = prog->suboffset;
2899             PL_reg_oldsavedcoffset = prog->suboffset;
2900 #ifdef PERL_ANY_COW
2901             PL_nrs = prog->saved_copy;
2902 #endif
2903             RXp_MATCH_COPIED_off(prog);
2904         }
2905         else
2906             PL_reg_oldsaved = NULL;
2907         prog->subbeg = PL_bostr;
2908         prog->suboffset = 0;
2909         prog->subcoffset = 0;
2910         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2911     }
2912 #ifdef DEBUGGING
2913     PL_reg_starttry = *startposp;
2914 #endif
2915     prog->offs[0].start = *startposp - PL_bostr;
2916     prog->lastparen = 0;
2917     prog->lastcloseparen = 0;
2918
2919     /* XXXX What this code is doing here?!!!  There should be no need
2920        to do this again and again, prog->lastparen should take care of
2921        this!  --ilya*/
2922
2923     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2924      * Actually, the code in regcppop() (which Ilya may be meaning by
2925      * prog->lastparen), is not needed at all by the test suite
2926      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2927      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2928      * Meanwhile, this code *is* needed for the
2929      * above-mentioned test suite tests to succeed.  The common theme
2930      * on those tests seems to be returning null fields from matches.
2931      * --jhi updated by dapm */
2932 #if 1
2933     if (prog->nparens) {
2934         regexp_paren_pair *pp = prog->offs;
2935         I32 i;
2936         for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2937             ++pp;
2938             pp->start = -1;
2939             pp->end = -1;
2940         }
2941     }
2942 #endif
2943     REGCP_SET(lastcp);
2944     result = regmatch(reginfo, *startposp, progi->program + 1);
2945     if (result != -1) {
2946         prog->offs[0].end = result;
2947         return 1;
2948     }
2949     if (reginfo->cutpoint)
2950         *startposp= reginfo->cutpoint;
2951     REGCP_UNWIND(lastcp);
2952     return 0;
2953 }
2954
2955
2956 #define sayYES goto yes
2957 #define sayNO goto no
2958 #define sayNO_SILENT goto no_silent
2959
2960 /* we dont use STMT_START/END here because it leads to
2961    "unreachable code" warnings, which are bogus, but distracting. */
2962 #define CACHEsayNO \
2963     if (ST.cache_mask) \
2964        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2965     sayNO
2966
2967 /* this is used to determine how far from the left messages like
2968    'failed...' are printed. It should be set such that messages
2969    are inline with the regop output that created them.
2970 */
2971 #define REPORT_CODE_OFF 32
2972
2973
2974 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2975 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2976 #define CHRTEST_NOT_A_CP_1 -999
2977 #define CHRTEST_NOT_A_CP_2 -998
2978
2979 #define SLAB_FIRST(s) (&(s)->states[0])
2980 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2981
2982 /* grab a new slab and return the first slot in it */
2983
2984 STATIC regmatch_state *
2985 S_push_slab(pTHX)
2986 {
2987 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2988     dMY_CXT;
2989 #endif
2990     regmatch_slab *s = PL_regmatch_slab->next;
2991     if (!s) {
2992         Newx(s, 1, regmatch_slab);
2993         s->prev = PL_regmatch_slab;
2994         s->next = NULL;
2995         PL_regmatch_slab->next = s;
2996     }
2997     PL_regmatch_slab = s;
2998     return SLAB_FIRST(s);
2999 }
3000
3001
3002 /* push a new state then goto it */
3003
3004 #define PUSH_STATE_GOTO(state, node, input) \
3005     pushinput = input; \
3006     scan = node; \
3007     st->resume_state = state; \
3008     goto push_state;
3009
3010 /* push a new state with success backtracking, then goto it */
3011
3012 #define PUSH_YES_STATE_GOTO(state, node, input) \
3013     pushinput = input; \
3014     scan = node; \
3015     st->resume_state = state; \
3016     goto push_yes_state;
3017
3018
3019
3020
3021 /*
3022
3023 regmatch() - main matching routine
3024
3025 This is basically one big switch statement in a loop. We execute an op,
3026 set 'next' to point the next op, and continue. If we come to a point which
3027 we may need to backtrack to on failure such as (A|B|C), we push a
3028 backtrack state onto the backtrack stack. On failure, we pop the top
3029 state, and re-enter the loop at the state indicated. If there are no more
3030 states to pop, we return failure.
3031
3032 Sometimes we also need to backtrack on success; for example /A+/, where
3033 after successfully matching one A, we need to go back and try to
3034 match another one; similarly for lookahead assertions: if the assertion
3035 completes successfully, we backtrack to the state just before the assertion
3036 and then carry on.  In these cases, the pushed state is marked as
3037 'backtrack on success too'. This marking is in fact done by a chain of
3038 pointers, each pointing to the previous 'yes' state. On success, we pop to
3039 the nearest yes state, discarding any intermediate failure-only states.
3040 Sometimes a yes state is pushed just to force some cleanup code to be
3041 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3042 it to free the inner regex.
3043
3044 Note that failure backtracking rewinds the cursor position, while
3045 success backtracking leaves it alone.
3046
3047 A pattern is complete when the END op is executed, while a subpattern
3048 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3049 ops trigger the "pop to last yes state if any, otherwise return true"
3050 behaviour.
3051
3052 A common convention in this function is to use A and B to refer to the two
3053 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3054 the subpattern to be matched possibly multiple times, while B is the entire
3055 rest of the pattern. Variable and state names reflect this convention.
3056
3057 The states in the main switch are the union of ops and failure/success of
3058 substates associated with with that op.  For example, IFMATCH is the op
3059 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3060 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3061 successfully matched A and IFMATCH_A_fail is a state saying that we have
3062 just failed to match A. Resume states always come in pairs. The backtrack
3063 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3064 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3065 on success or failure.
3066
3067 The struct that holds a backtracking state is actually a big union, with
3068 one variant for each major type of op. The variable st points to the
3069 top-most backtrack struct. To make the code clearer, within each
3070 block of code we #define ST to alias the relevant union.
3071
3072 Here's a concrete example of a (vastly oversimplified) IFMATCH
3073 implementation:
3074
3075     switch (state) {
3076     ....
3077
3078 #define ST st->u.ifmatch
3079
3080     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3081         ST.foo = ...; // some state we wish to save
3082         ...
3083         // push a yes backtrack state with a resume value of
3084         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3085         // first node of A:
3086         PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3087         // NOTREACHED
3088
3089     case IFMATCH_A: // we have successfully executed A; now continue with B
3090         next = B;
3091         bar = ST.foo; // do something with the preserved value
3092         break;
3093
3094     case IFMATCH_A_fail: // A failed, so the assertion failed
3095         ...;   // do some housekeeping, then ...
3096         sayNO; // propagate the failure
3097
3098 #undef ST
3099
3100     ...
3101     }
3102
3103 For any old-timers reading this who are familiar with the old recursive
3104 approach, the code above is equivalent to:
3105
3106     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3107     {
3108         int foo = ...
3109         ...
3110         if (regmatch(A)) {
3111             next = B;
3112             bar = foo;
3113             break;
3114         }
3115         ...;   // do some housekeeping, then ...
3116         sayNO; // propagate the failure
3117     }
3118
3119 The topmost backtrack state, pointed to by st, is usually free. If you
3120 want to claim it, populate any ST.foo fields in it with values you wish to
3121 save, then do one of
3122
3123         PUSH_STATE_GOTO(resume_state, node, newinput);
3124         PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3125
3126 which sets that backtrack state's resume value to 'resume_state', pushes a
3127 new free entry to the top of the backtrack stack, then goes to 'node'.
3128 On backtracking, the free slot is popped, and the saved state becomes the
3129 new free state. An ST.foo field in this new top state can be temporarily
3130 accessed to retrieve values, but once the main loop is re-entered, it
3131 becomes available for reuse.
3132
3133 Note that the depth of the backtrack stack constantly increases during the
3134 left-to-right execution of the pattern, rather than going up and down with
3135 the pattern nesting. For example the stack is at its maximum at Z at the
3136 end of the pattern, rather than at X in the following:
3137
3138     /(((X)+)+)+....(Y)+....Z/
3139
3140 The only exceptions to this are lookahead/behind assertions and the cut,
3141 (?>A), which pop all the backtrack states associated with A before
3142 continuing.
3143
3144 Backtrack state structs are allocated in slabs of about 4K in size.
3145 PL_regmatch_state and st always point to the currently active state,
3146 and PL_regmatch_slab points to the slab currently containing
3147 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3148 allocated, and is never freed until interpreter destruction. When the slab
3149 is full, a new one is allocated and chained to the end. At exit from
3150 regmatch(), slabs allocated since entry are freed.
3151
3152 */
3153
3154
3155 #define DEBUG_STATE_pp(pp)                                  \
3156     DEBUG_STATE_r({                                         \
3157         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
3158         PerlIO_printf(Perl_debug_log,                       \
3159             "    %*s"pp" %s%s%s%s%s\n",                     \
3160             depth*2, "",                                    \
3161             PL_reg_name[st->resume_state],                     \
3162             ((st==yes_state||st==mark_state) ? "[" : ""),   \
3163             ((st==yes_state) ? "Y" : ""),                   \
3164             ((st==mark_state) ? "M" : ""),                  \
3165             ((st==yes_state||st==mark_state) ? "]" : "")    \
3166         );                                                  \
3167     });
3168
3169
3170 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3171
3172 #ifdef DEBUGGING
3173
3174 STATIC void
3175 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3176     const char *start, const char *end, const char *blurb)
3177 {
3178     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3179
3180     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3181
3182     if (!PL_colorset)
3183             reginitcolors();
3184     {
3185         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3186             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3187
3188         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3189             start, end - start, 60);
3190
3191         PerlIO_printf(Perl_debug_log,
3192             "%s%s REx%s %s against %s\n",
3193                        PL_colors[4], blurb, PL_colors[5], s0, s1);
3194
3195         if (utf8_target||utf8_pat)
3196             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3197                 utf8_pat ? "pattern" : "",
3198                 utf8_pat && utf8_target ? " and " : "",
3199                 utf8_target ? "string" : ""
3200             );
3201     }
3202 }
3203
3204 STATIC void
3205 S_dump_exec_pos(pTHX_ const char *locinput,
3206                       const regnode *scan,
3207                       const char *loc_regeol,
3208                       const char *loc_bostr,
3209                       const char *loc_reg_starttry,
3210                       const bool utf8_target)
3211 {
3212     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3213     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3214     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3215     /* The part of the string before starttry has one color
3216        (pref0_len chars), between starttry and current
3217        position another one (pref_len - pref0_len chars),
3218        after the current position the third one.
3219        We assume that pref0_len <= pref_len, otherwise we
3220        decrease pref0_len.  */
3221     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3222         ? (5 + taill) - l : locinput - loc_bostr;
3223     int pref0_len;
3224
3225     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3226
3227     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3228         pref_len++;
3229     pref0_len = pref_len  - (locinput - loc_reg_starttry);
3230     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3231         l = ( loc_regeol - locinput > (5 + taill) - pref_len
3232               ? (5 + taill) - pref_len : loc_regeol - locinput);
3233     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3234         l--;
3235     if (pref0_len < 0)
3236         pref0_len = 0;
3237     if (pref0_len > pref_len)
3238         pref0_len = pref_len;
3239     {
3240         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3241
3242         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3243             (locinput - pref_len),pref0_len, 60, 4, 5);
3244
3245         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3246                     (locinput - pref_len + pref0_len),
3247                     pref_len - pref0_len, 60, 2, 3);
3248
3249         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3250                     locinput, loc_regeol - locinput, 10, 0, 1);
3251
3252         const STRLEN tlen=len0+len1+len2;
3253         PerlIO_printf(Perl_debug_log,
3254                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3255                     (IV)(locinput - loc_bostr),
3256                     len0, s0,
3257                     len1, s1,
3258                     (docolor ? "" : "> <"),
3259                     len2, s2,
3260                     (int)(tlen > 19 ? 0 :  19 - tlen),
3261                     "");
3262     }
3263 }
3264
3265 #endif
3266
3267 /* reg_check_named_buff_matched()
3268  * Checks to see if a named buffer has matched. The data array of
3269  * buffer numbers corresponding to the buffer is expected to reside
3270  * in the regexp->data->data array in the slot stored in the ARG() of
3271  * node involved. Note that this routine doesn't actually care about the
3272  * name, that information is not preserved from compilation to execution.
3273  * Returns the index of the leftmost defined buffer with the given name
3274  * or 0 if non of the buffers matched.
3275  */
3276 STATIC I32
3277 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3278 {
3279     I32 n;
3280     RXi_GET_DECL(rex,rexi);
3281     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3282     I32 *nums=(I32*)SvPVX(sv_dat);
3283
3284     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3285
3286     for ( n=0; n<SvIVX(sv_dat); n++ ) {
3287         if ((I32)rex->lastparen >= nums[n] &&
3288             rex->offs[nums[n]].end != -1)
3289         {
3290             return nums[n];
3291         }
3292     }
3293     return 0;
3294 }
3295
3296
3297 /* free all slabs above current one  - called during LEAVE_SCOPE */
3298
3299 STATIC void
3300 S_clear_backtrack_stack(pTHX_ void *p)
3301 {
3302     regmatch_slab *s = PL_regmatch_slab->next;
3303     PERL_UNUSED_ARG(p);
3304
3305     if (!s)
3306         return;
3307     PL_regmatch_slab->next = NULL;
3308     while (s) {
3309         regmatch_slab * const osl = s;
3310         s = s->next;
3311         Safefree(osl);
3312     }
3313 }
3314 static bool
3315 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3316 {
3317     /* This function determines if there are one or two characters that match
3318      * the first character of the passed-in EXACTish node <text_node>, and if
3319      * so, returns them in the passed-in pointers.
3320      *
3321      * If it determines that no possible character in the target string can
3322      * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3323      * the first character in <text_node> requires UTF-8 to represent, and the
3324      * target string isn't in UTF-8.)
3325      *
3326      * If there are more than two characters that could match the beginning of
3327      * <text_node>, or if more context is required to determine a match or not,
3328      * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3329      *
3330      * The motiviation behind this function is to allow the caller to set up
3331      * tight loops for matching.  If <text_node> is of type EXACT, there is
3332      * only one possible character that can match its first character, and so
3333      * the situation is quite simple.  But things get much more complicated if
3334      * folding is involved.  It may be that the first character of an EXACTFish
3335      * node doesn't participate in any possible fold, e.g., punctuation, so it
3336      * can be matched only by itself.  The vast majority of characters that are
3337      * in folds match just two things, their lower and upper-case equivalents.
3338      * But not all are like that; some have multiple possible matches, or match
3339      * sequences of more than one character.  This function sorts all that out.
3340      *
3341      * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3342      * loop of trying to match A*, we know we can't exit where the thing
3343      * following it isn't a B.  And something can't be a B unless it is the
3344      * beginning of B.  By putting a quick test for that beginning in a tight
3345      * loop, we can rule out things that can't possibly be B without having to
3346      * break out of the loop, thus avoiding work.  Similarly, if A is a single
3347      * character, we can make a tight loop matching A*, using the outputs of
3348      * this function.
3349      *
3350      * If the target string to match isn't in UTF-8, and there aren't
3351      * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3352      * the one or two possible octets (which are characters in this situation)
3353      * that can match.  In all cases, if there is only one character that can
3354      * match, *<c1p> and *<c2p> will be identical.
3355      *
3356      * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3357      * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3358      * can match the beginning of <text_node>.  They should be declared with at
3359      * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3360      * undefined what these contain.)  If one or both of the buffers are
3361      * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3362      * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3363      * *<c2p> will be set to a negative number(s) that shouldn't match any code
3364      * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3365      * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3366
3367     const bool utf8_target = PL_reg_match_utf8;
3368
3369     UV c1 = CHRTEST_NOT_A_CP_1;
3370     UV c2 = CHRTEST_NOT_A_CP_2;
3371     bool use_chrtest_void = FALSE;
3372
3373     /* Used when we have both utf8 input and utf8 output, to avoid converting
3374      * to/from code points */
3375     bool utf8_has_been_setup = FALSE;
3376
3377     dVAR;
3378
3379     U8 *pat = (U8*)STRING(text_node);
3380
3381     if (OP(text_node) == EXACT) {
3382
3383         /* In an exact node, only one thing can be matched, that first
3384          * character.  If both the pat and the target are UTF-8, we can just
3385          * copy the input to the output, avoiding finding the code point of
3386          * that character */
3387         if (! UTF_PATTERN) {
3388             c2 = c1 = *pat;
3389         }
3390         else if (utf8_target) {
3391             Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3392             Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3393             utf8_has_been_setup = TRUE;
3394         }
3395         else {
3396             c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3397         }
3398     }
3399     else /* an EXACTFish node */
3400          if ((UTF_PATTERN
3401                     && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3402                                                     pat + STR_LEN(text_node)))
3403              || (! UTF_PATTERN
3404                     && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3405                                                     pat + STR_LEN(text_node))))
3406     {
3407         /* Multi-character folds require more context to sort out.  Also
3408          * PL_utf8_foldclosures used below doesn't handle them, so have to be
3409          * handled outside this routine */
3410         use_chrtest_void = TRUE;
3411     }
3412     else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3413         c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3414         if (c1 > 256) {
3415             /* Load the folds hash, if not already done */
3416             SV** listp;
3417             if (! PL_utf8_foldclosures) {
3418                 if (! PL_utf8_tofold) {
3419                     U8 dummy[UTF8_MAXBYTES+1];
3420
3421                     /* Force loading this by folding an above-Latin1 char */
3422                     to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3423                     assert(PL_utf8_tofold); /* Verify that worked */
3424                 }
3425                 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3426             }
3427
3428             /* The fold closures data structure is a hash with the keys being
3429              * the UTF-8 of every character that is folded to, like 'k', and
3430              * the values each an array of all code points that fold to its
3431              * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3432              * not included */
3433             if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3434                                      (char *) pat,
3435                                      UTF8SKIP(pat),
3436                                      FALSE))))
3437             {
3438                 /* Not found in the hash, therefore there are no folds
3439                  * containing it, so there is only a single character that
3440                  * could match */
3441                 c2 = c1;
3442             }
3443             else {  /* Does participate in folds */
3444                 AV* list = (AV*) *listp;
3445                 if (av_len(list) != 1) {
3446
3447                     /* If there aren't exactly two folds to this, it is outside
3448                      * the scope of this function */
3449                     use_chrtest_void = TRUE;
3450                 }
3451                 else {  /* There are two.  Get them */
3452                     SV** c_p = av_fetch(list, 0, FALSE);
3453                     if (c_p == NULL) {
3454                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3455                     }
3456                     c1 = SvUV(*c_p);
3457
3458                     c_p = av_fetch(list, 1, FALSE);
3459                     if (c_p == NULL) {
3460                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3461                     }
3462                     c2 = SvUV(*c_p);
3463
3464                     /* Folds that cross the 255/256 boundary are forbidden if
3465                      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3466                      * pattern character is above 256, and its only other match
3467                      * is below 256, the only legal match will be to itself.
3468                      * We have thrown away the original, so have to compute
3469                      * which is the one above 255 */
3470                     if ((c1 < 256) != (c2 < 256)) {
3471                         if (OP(text_node) == EXACTFL
3472                             || (OP(text_node) == EXACTFA
3473                                 && (isASCII(c1) || isASCII(c2))))
3474                         {
3475                             if (c1 < 256) {
3476                                 c1 = c2;
3477                             }
3478                             else {
3479                                 c2 = c1;
3480                             }
3481                         }
3482                     }
3483                 }
3484             }
3485         }
3486         else /* Here, c1 is < 255 */
3487              if (utf8_target
3488                  && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3489                  && OP(text_node) != EXACTFL
3490                  && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3491         {
3492             /* Here, there could be something above Latin1 in the target which
3493              * folds to this character in the pattern.  All such cases except
3494              * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3495              * involved in their folds, so are outside the scope of this
3496              * function */
3497             if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3498                 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3499             }
3500             else {
3501                 use_chrtest_void = TRUE;
3502             }
3503         }
3504         else { /* Here nothing above Latin1 can fold to the pattern character */
3505             switch (OP(text_node)) {
3506
3507                 case EXACTFL:   /* /l rules */
3508                     c2 = PL_fold_locale[c1];
3509                     break;
3510
3511                 case EXACTF:
3512                     if (! utf8_target) {    /* /d rules */
3513                         c2 = PL_fold[c1];
3514                         break;
3515                     }
3516                     /* FALLTHROUGH */
3517                     /* /u rules for all these.  This happens to work for
3518                      * EXACTFA as nothing in Latin1 folds to ASCII */
3519                 case EXACTFA:
3520                 case EXACTFU_TRICKYFOLD:
3521                 case EXACTFU_SS:
3522                 case EXACTFU:
3523                     c2 = PL_fold_latin1[c1];
3524                     break;
3525
3526                 default:
3527                     Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3528                     assert(0); /* NOTREACHED */
3529             }
3530         }
3531     }
3532
3533     /* Here have figured things out.  Set up the returns */
3534     if (use_chrtest_void) {
3535         *c2p = *c1p = CHRTEST_VOID;
3536     }
3537     else if (utf8_target) {
3538         if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3539             uvchr_to_utf8(c1_utf8, c1);
3540             uvchr_to_utf8(c2_utf8, c2);
3541         }
3542
3543         /* Invariants are stored in both the utf8 and byte outputs; Use
3544          * negative numbers otherwise for the byte ones.  Make sure that the
3545          * byte ones are the same iff the utf8 ones are the same */
3546         *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3547         *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3548                 ? *c2_utf8
3549                 : (c1 == c2)
3550                   ? CHRTEST_NOT_A_CP_1
3551                   : CHRTEST_NOT_A_CP_2;
3552     }
3553     else if (c1 > 255) {
3554        if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3555                            can represent */
3556            return FALSE;
3557        }
3558
3559        *c1p = *c2p = c2;    /* c2 is the only representable value */
3560     }
3561     else {  /* c1 is representable; see about c2 */
3562        *c1p = c1;
3563        *c2p = (c2 < 256) ? c2 : c1;
3564     }
3565
3566     return TRUE;
3567 }
3568
3569 /* returns -1 on failure, $+[0] on success */
3570 STATIC I32
3571 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3572 {
3573 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3574     dMY_CXT;
3575 #endif
3576     dVAR;
3577     const bool utf8_target = PL_reg_match_utf8;
3578     const U32 uniflags = UTF8_ALLOW_DEFAULT;
3579     REGEXP *rex_sv = reginfo->prog;
3580     regexp *rex = ReANY(rex_sv);
3581     RXi_GET_DECL(rex,rexi);
3582     I32 oldsave;
3583     /* the current state. This is a cached copy of PL_regmatch_state */
3584     regmatch_state *st;
3585     /* cache heavy used fields of st in registers */
3586     regnode *scan;
3587     regnode *next;
3588     U32 n = 0;  /* general value; init to avoid compiler warning */
3589     I32 ln = 0; /* len or last;  init to avoid compiler warning */
3590     char *locinput = startpos;
3591     char *pushinput; /* where to continue after a PUSH */
3592     I32 nextchr;   /* is always set to UCHARAT(locinput) */
3593
3594     bool result = 0;        /* return value of S_regmatch */
3595     int depth = 0;          /* depth of backtrack stack */
3596     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3597     const U32 max_nochange_depth =
3598         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3599         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3600     regmatch_state *yes_state = NULL; /* state to pop to on success of
3601                                                             subpattern */
3602     /* mark_state piggy backs on the yes_state logic so that when we unwind
3603        the stack on success we can update the mark_state as we go */
3604     regmatch_state *mark_state = NULL; /* last mark state we have seen */
3605     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3606     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3607     U32 state_num;
3608     bool no_final = 0;      /* prevent failure from backtracking? */
3609     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3610     char *startpoint = locinput;
3611     SV *popmark = NULL;     /* are we looking for a mark? */
3612     SV *sv_commit = NULL;   /* last mark name seen in failure */
3613     SV *sv_yes_mark = NULL; /* last mark name we have seen
3614                                during a successful match */
3615     U32 lastopen = 0;       /* last open we saw */
3616     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3617     SV* const oreplsv = GvSV(PL_replgv);
3618     /* these three flags are set by various ops to signal information to
3619      * the very next op. They have a useful lifetime of exactly one loop
3620      * iteration, and are not preserved or restored by state pushes/pops
3621      */
3622     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3623     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3624     int logical = 0;        /* the following EVAL is:
3625                                 0: (?{...})
3626                                 1: (?(?{...})X|Y)
3627                                 2: (??{...})
3628                                or the following IFMATCH/UNLESSM is:
3629                                 false: plain (?=foo)
3630                                 true:  used as a condition: (?(?=foo))
3631                             */
3632     PAD* last_pad = NULL;
3633     dMULTICALL;
3634     I32 gimme = G_SCALAR;
3635     CV *caller_cv = NULL;       /* who called us */
3636     CV *last_pushed_cv = NULL;  /* most recently called (?{}) CV */
3637     CHECKPOINT runops_cp;       /* savestack position before executing EVAL */
3638     U32 maxopenparen = 0;       /* max '(' index seen so far */
3639
3640 #ifdef DEBUGGING
3641     GET_RE_DEBUG_FLAGS_DECL;
3642 #endif
3643
3644     /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3645     multicall_oldcatch = 0;
3646     multicall_cv = NULL;
3647     cx = NULL;
3648     PERL_UNUSED_VAR(multicall_cop);
3649     PERL_UNUSED_VAR(newsp);
3650
3651
3652     PERL_ARGS_ASSERT_REGMATCH;
3653
3654     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3655             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3656     }));
3657     /* on first ever call to regmatch, allocate first slab */
3658     if (!PL_regmatch_slab) {
3659         Newx(PL_regmatch_slab, 1, regmatch_slab);
3660         PL_regmatch_slab->prev = NULL;
3661         PL_regmatch_slab->next = NULL;
3662         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3663     }
3664
3665     oldsave = PL_savestack_ix;
3666     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3667     SAVEVPTR(PL_regmatch_slab);
3668     SAVEVPTR(PL_regmatch_state);
3669
3670     /* grab next free state slot */
3671     st = ++PL_regmatch_state;
3672     if (st >  SLAB_LAST(PL_regmatch_slab))
3673         st = PL_regmatch_state = S_push_slab(aTHX);
3674
3675     /* Note that nextchr is a byte even in UTF */
3676     SET_nextchr;
3677     scan = prog;
3678     while (scan != NULL) {
3679
3680         DEBUG_EXECUTE_r( {
3681             SV * const prop = sv_newmortal();
3682             regnode *rnext=regnext(scan);
3683             DUMP_EXEC_POS( locinput, scan, utf8_target );
3684             regprop(rex, prop, scan);
3685
3686             PerlIO_printf(Perl_debug_log,
3687                     "%3"IVdf":%*s%s(%"IVdf")\n",
3688                     (IV)(scan - rexi->program), depth*2, "",
3689                     SvPVX_const(prop),
3690                     (PL_regkind[OP(scan)] == END || !rnext) ?
3691                         0 : (IV)(rnext - rexi->program));
3692         });
3693
3694         next = scan + NEXT_OFF(scan);
3695         if (next == scan)
3696             next = NULL;
3697         state_num = OP(scan);
3698
3699         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3700       reenter_switch:
3701
3702         SET_nextchr;
3703         assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3704
3705         switch (state_num) {
3706         case BOL: /*  /^../  */
3707             if (locinput == PL_bostr)
3708             {
3709                 /* reginfo->till = reginfo->bol; */
3710                 break;
3711             }
3712             sayNO;
3713
3714         case MBOL: /*  /^../m  */
3715             if (locinput == PL_bostr ||
3716                 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3717             {
3718                 break;
3719             }
3720             sayNO;
3721
3722         case SBOL: /*  /^../s  */
3723             if (locinput == PL_bostr)
3724                 break;
3725             sayNO;
3726
3727         case GPOS: /*  \G  */
3728             if (locinput == reginfo->ganch)
3729                 break;
3730             sayNO;
3731
3732         case KEEPS: /*   \K  */
3733             /* update the startpoint */
3734             st->u.keeper.val = rex->offs[0].start;
3735             rex->offs[0].start = locinput - PL_bostr;
3736             PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3737             assert(0); /*NOTREACHED*/
3738         case KEEPS_next_fail:
3739             /* rollback the start point change */
3740             rex->offs[0].start = st->u.keeper.val;
3741             sayNO_SILENT;
3742             assert(0); /*NOTREACHED*/
3743
3744         case EOL: /* /..$/  */
3745                 goto seol;
3746
3747         case MEOL: /* /..$/m  */
3748             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3749                 sayNO;
3750             break;
3751
3752         case SEOL: /* /..$/s  */
3753           seol:
3754             if (!NEXTCHR_IS_EOS && nextchr != '\n')
3755                 sayNO;
3756             if (PL_regeol - locinput > 1)
3757                 sayNO;
3758             break;
3759
3760         case EOS: /*  \z  */
3761             if (!NEXTCHR_IS_EOS)
3762                 sayNO;
3763             break;
3764
3765         case SANY: /*  /./s  */
3766             if (NEXTCHR_IS_EOS)
3767                 sayNO;
3768             goto increment_locinput;
3769
3770         case CANY: /*  \C  */
3771             if (NEXTCHR_IS_EOS)
3772                 sayNO;
3773             locinput++;
3774             break;
3775
3776         case REG_ANY: /*  /./  */
3777             if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3778                 sayNO;
3779             goto increment_locinput;
3780
3781
3782 #undef  ST
3783 #define ST st->u.trie
3784         case TRIEC: /* (ab|cd) with known charclass */
3785             /* In this case the charclass data is available inline so
3786                we can fail fast without a lot of extra overhead.
3787              */
3788             if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3789                 DEBUG_EXECUTE_r(
3790                     PerlIO_printf(Perl_debug_log,
3791                               "%*s  %sfailed to match trie start class...%s\n",
3792                               REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3793                 );
3794                 sayNO_SILENT;
3795                 assert(0); /* NOTREACHED */
3796             }
3797             /* FALL THROUGH */
3798         case TRIE:  /* (ab|cd)  */
3799             /* the basic plan of execution of the trie is:
3800              * At the beginning, run though all the states, and
3801              * find the longest-matching word. Also remember the position
3802              * of the shortest matching word. For example, this pattern:
3803              *    1  2 3 4    5
3804              *    ab|a|x|abcd|abc
3805              * when matched against the string "abcde", will generate
3806              * accept states for all words except 3, with the longest
3807              * matching word being 4, and the shortest being 2 (with
3808              * the position being after char 1 of the string).
3809              *
3810              * Then for each matching word, in word order (i.e. 1,2,4,5),
3811              * we run the remainder of the pattern; on each try setting
3812              * the current position to the character following the word,
3813              * returning to try the next word on failure.
3814              *
3815              * We avoid having to build a list of words at runtime by
3816              * using a compile-time structure, wordinfo[].prev, which
3817              * gives, for each word, the previous accepting word (if any).
3818              * In the case above it would contain the mappings 1->2, 2->0,
3819              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3820              * the longest word (4 above), a list of all words, by
3821              * following the list of prev pointers; this gives us the
3822              * unordered list 4,5,1,2. Then given the current word we have
3823              * just tried, we can go through the list and find the
3824              * next-biggest word to try (so if we just failed on word 2,
3825              * the next in the list is 4).
3826              *
3827              * Since at runtime we don't record the matching position in
3828              * the string for each word, we have to work that out for
3829              * each word we're about to process. The wordinfo table holds
3830              * the character length of each word; given that we recorded
3831              * at the start: the position of the shortest word and its
3832              * length in chars, we just need to move the pointer the
3833              * difference between the two char lengths. Depending on
3834              * Unicode status and folding, that's cheap or expensive.
3835              *
3836              * This algorithm is optimised for the case where are only a
3837              * small number of accept states, i.e. 0,1, or maybe 2.
3838              * With lots of accepts states, and having to try all of them,
3839              * it becomes quadratic on number of accept states to find all
3840              * the next words.
3841              */
3842
3843             {
3844                 /* what type of TRIE am I? (utf8 makes this contextual) */
3845                 DECL_TRIE_TYPE(scan);
3846
3847                 /* what trie are we using right now */
3848                 reg_trie_data * const trie
3849                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3850                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3851                 U32 state = trie->startstate;
3852
3853                 if (   trie->bitmap
3854                     && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3855                 {
3856                     if (trie->states[ state ].wordnum) {
3857                          DEBUG_EXECUTE_r(
3858                             PerlIO_printf(Perl_debug_log,
3859                                           "%*s  %smatched empty string...%s\n",
3860                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3861                         );
3862                         if (!trie->jump)
3863                             break;
3864                     } else {
3865                         DEBUG_EXECUTE_r(
3866                             PerlIO_printf(Perl_debug_log,
3867                                           "%*s  %sfailed to match trie start class...%s\n",
3868                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3869                         );
3870                         sayNO_SILENT;
3871                    }
3872                 }
3873
3874             {
3875                 U8 *uc = ( U8* )locinput;
3876
3877                 STRLEN len = 0;
3878                 STRLEN foldlen = 0;
3879                 U8 *uscan = (U8*)NULL;
3880                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3881                 U32 charcount = 0; /* how many input chars we have matched */
3882                 U32 accepted = 0; /* have we seen any accepting states? */
3883
3884                 ST.jump = trie->jump;
3885                 ST.me = scan;
3886                 ST.firstpos = NULL;
3887                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3888                 ST.nextword = 0;
3889
3890                 /* fully traverse the TRIE; note the position of the
3891                    shortest accept state and the wordnum of the longest
3892                    accept state */
3893
3894                 while ( state && uc <= (U8*)PL_regeol ) {
3895                     U32 base = trie->states[ state ].trans.base;
3896                     UV uvc = 0;
3897                     U16 charid = 0;
3898                     U16 wordnum;
3899                     wordnum = trie->states[ state ].wordnum;
3900
3901                     if (wordnum) { /* it's an accept state */
3902                         if (!accepted) {
3903                             accepted = 1;
3904                             /* record first match position */
3905                             if (ST.longfold) {
3906                                 ST.firstpos = (U8*)locinput;
3907                                 ST.firstchars = 0;
3908                             }
3909                             else {
3910                                 ST.firstpos = uc;
3911                                 ST.firstchars = charcount;
3912                             }
3913                         }
3914                         if (!ST.nextword || wordnum < ST.nextword)
3915                             ST.nextword = wordnum;
3916                         ST.topword = wordnum;
3917                     }
3918
3919                     DEBUG_TRIE_EXECUTE_r({
3920                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3921                                 PerlIO_printf( Perl_debug_log,
3922                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3923                                     2+depth * 2, "", PL_colors[4],
3924                                     (UV)state, (accepted ? 'Y' : 'N'));
3925                     });
3926
3927                     /* read a char and goto next state */
3928                     if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3929                         I32 offset;
3930                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3931                                              uscan, len, uvc, charid, foldlen,
3932                                              foldbuf, uniflags);
3933                         charcount++;
3934                         if (foldlen>0)
3935                             ST.longfold = TRUE;
3936                         if (charid &&
3937                              ( ((offset =
3938                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3939
3940                              && ((U32)offset < trie->lasttrans)
3941                              && trie->trans[offset].check == state)
3942                         {
3943                             state = trie->trans[offset].next;
3944                         }
3945                         else {
3946                             state = 0;
3947                         }
3948                         uc += len;
3949
3950                     }
3951                     else {
3952                         state = 0;
3953                     }
3954                     DEBUG_TRIE_EXECUTE_r(
3955                         PerlIO_printf( Perl_debug_log,
3956                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3957                             charid, uvc, (UV)state, PL_colors[5] );
3958                     );
3959                 }
3960                 if (!accepted)
3961                    sayNO;
3962
3963                 /* calculate total number of accept states */
3964                 {
3965                     U16 w = ST.topword;
3966                     accepted = 0;
3967                     while (w) {
3968                         w = trie->wordinfo[w].prev;
3969                         accepted++;
3970                     }
3971                     ST.accepted = accepted;
3972                 }
3973
3974                 DEBUG_EXECUTE_r(
3975                     PerlIO_printf( Perl_debug_log,
3976                         "%*s  %sgot %"IVdf" possible matches%s\n",
3977                         REPORT_CODE_OFF + depth * 2, "",
3978                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3979                 );
3980                 goto trie_first_try; /* jump into the fail handler */
3981             }}
3982             assert(0); /* NOTREACHED */
3983
3984         case TRIE_next_fail: /* we failed - try next alternative */
3985         {
3986             U8 *uc;
3987             if ( ST.jump) {
3988                 REGCP_UNWIND(ST.cp);
3989                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3990             }
3991             if (!--ST.accepted) {
3992                 DEBUG_EXECUTE_r({
3993                     PerlIO_printf( Perl_debug_log,
3994                         "%*s  %sTRIE failed...%s\n",
3995                         REPORT_CODE_OFF+depth*2, "",
3996                         PL_colors[4],
3997                         PL_colors[5] );
3998                 });
3999                 sayNO_SILENT;
4000             }
4001             {
4002                 /* Find next-highest word to process.  Note that this code
4003                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
4004                 U16 min = 0;
4005                 U16 word;
4006                 U16 const nextword = ST.nextword;
4007                 reg_trie_wordinfo * const wordinfo
4008                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4009                 for (word=ST.topword; word; word=wordinfo[word].prev) {
4010                     if (word > nextword && (!min || word < min))
4011                         min = word;
4012                 }
4013                 ST.nextword = min;
4014             }
4015
4016           trie_first_try:
4017             if (do_cutgroup) {
4018                 do_cutgroup = 0;
4019                 no_final = 0;
4020             }
4021
4022             if ( ST.jump) {
4023                 ST.lastparen = rex->lastparen;
4024                 ST.lastcloseparen = rex->lastcloseparen;
4025                 REGCP_SET(ST.cp);
4026             }
4027
4028             /* find start char of end of current word */
4029             {
4030                 U32 chars; /* how many chars to skip */
4031                 reg_trie_data * const trie
4032                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4033
4034                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4035                             >=  ST.firstchars);
4036                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4037                             - ST.firstchars;
4038                 uc = ST.firstpos;
4039
4040                 if (ST.longfold) {
4041                     /* the hard option - fold each char in turn and find
4042                      * its folded length (which may be different */
4043                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4044                     STRLEN foldlen;
4045                     STRLEN len;
4046                     UV uvc;
4047                     U8 *uscan;
4048
4049                     while (chars) {
4050                         if (utf8_target) {
4051                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4052                                                     uniflags);
4053                             uc += len;
4054                         }
4055                         else {
4056                             uvc = *uc;
4057                             uc++;
4058                         }
4059                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4060                         uscan = foldbuf;
4061                         while (foldlen) {
4062                             if (!--chars)
4063                                 break;
4064                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4065                                             uniflags);
4066                             uscan += len;
4067                             foldlen -= len;
4068                         }
4069                     }
4070                 }
4071                 else {
4072                     if (utf8_target)
4073                         while (chars--)
4074                             uc += UTF8SKIP(uc);
4075                     else
4076                         uc += chars;
4077                 }
4078             }
4079
4080             scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4081                             ? ST.jump[ST.nextword]
4082                             : NEXT_OFF(ST.me));
4083
4084             DEBUG_EXECUTE_r({
4085                 PerlIO_printf( Perl_debug_log,
4086                     "%*s  %sTRIE matched word #%d, continuing%s\n",
4087                     REPORT_CODE_OFF+depth*2, "",
4088                     PL_colors[4],
4089                     ST.nextword,
4090                     PL_colors[5]
4091                     );
4092             });
4093
4094             if (ST.accepted > 1 || has_cutgroup) {
4095                 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4096                 assert(0); /* NOTREACHED */
4097             }
4098             /* only one choice left - just continue */
4099             DEBUG_EXECUTE_r({
4100                 AV *const trie_words
4101                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4102                 SV ** const tmp = av_fetch( trie_words,
4103                     ST.nextword-1, 0 );
4104                 SV *sv= tmp ? sv_newmortal() : NULL;
4105
4106                 PerlIO_printf( Perl_debug_log,
4107                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4108                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4109                     ST.nextword,
4110                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4111                             PL_colors[0], PL_colors[1],
4112                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4113                         )
4114                     : "not compiled under -Dr",
4115                     PL_colors[5] );
4116             });
4117
4118             locinput = (char*)uc;
4119             continue; /* execute rest of RE */
4120             assert(0); /* NOTREACHED */
4121         }
4122 #undef  ST
4123
4124         case EXACT: {            /*  /abc/        */
4125             char *s = STRING(scan);
4126             ln = STR_LEN(scan);
4127             if (utf8_target != UTF_PATTERN) {
4128                 /* The target and the pattern have differing utf8ness. */
4129                 char *l = locinput;
4130                 const char * const e = s + ln;
4131
4132                 if (utf8_target) {
4133                     /* The target is utf8, the pattern is not utf8.
4134                      * Above-Latin1 code points can't match the pattern;
4135                      * invariants match exactly, and the other Latin1 ones need
4136                      * to be downgraded to a single byte in order to do the
4137                      * comparison.  (If we could be confident that the target
4138                      * is not malformed, this could be refactored to have fewer
4139                      * tests by just assuming that if the first bytes match, it
4140                      * is an invariant, but there are tests in the test suite
4141                      * dealing with (??{...}) which violate this) */
4142                     while (s < e) {
4143                         if (l >= PL_regeol)
4144                              sayNO;
4145                         if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4146                             sayNO;
4147                         }
4148                         if (UTF8_IS_INVARIANT(*(U8*)l)) {
4149                             if (*l != *s) {
4150                                 sayNO;
4151                             }
4152                             l++;
4153                         }
4154                         else {
4155                             if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4156                                 sayNO;
4157                             }
4158                             l += 2;
4159                         }
4160                         s++;
4161                     }
4162                 }
4163                 else {
4164                     /* The target is not utf8, the pattern is utf8. */
4165                     while (s < e) {
4166                         if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4167                         {
4168                             sayNO;
4169                         }
4170                         if (UTF8_IS_INVARIANT(*(U8*)s)) {
4171                             if (*s != *l) {
4172                                 sayNO;
4173                             }
4174                             s++;
4175                         }
4176                         else {
4177                             if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4178                                 sayNO;
4179                             }
4180                             s += 2;
4181                         }
4182                         l++;
4183                     }
4184                 }
4185                 locinput = l;
4186                 break;
4187             }
4188             /* The target and the pattern have the same utf8ness. */
4189             /* Inline the first character, for speed. */
4190             if (UCHARAT(s) != nextchr)
4191                 sayNO;
4192             if (PL_regeol - locinput < ln)
4193                 sayNO;
4194             if (ln > 1 && memNE(s, locinput, ln))
4195                 sayNO;
4196             locinput += ln;
4197             break;
4198             }
4199
4200         case EXACTFL: {          /*  /abc/il      */
4201             re_fold_t folder;
4202             const U8 * fold_array;
4203             const char * s;
4204             U32 fold_utf8_flags;
4205
4206             PL_reg_flags |= RF_tainted;
4207             folder = foldEQ_locale;
4208             fold_array = PL_fold_locale;
4209             fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4210             goto do_exactf;
4211
4212         case EXACTFU_SS:         /*  /\x{df}/iu   */
4213         case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4214         case EXACTFU:            /*  /abc/iu      */
4215             folder = foldEQ_latin1;
4216             fold_array = PL_fold_latin1;
4217             fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4218             goto do_exactf;
4219
4220         case EXACTFA:            /*  /abc/iaa     */
4221             folder = foldEQ_latin1;
4222             fold_array = PL_fold_latin1;
4223             fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4224             goto do_exactf;
4225
4226         case EXACTF:             /*  /abc/i       */
4227             folder = foldEQ;
4228             fold_array = PL_fold;
4229             fold_utf8_flags = 0;
4230
4231           do_exactf:
4232             s = STRING(scan);
4233             ln = STR_LEN(scan);
4234
4235             if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4236               /* Either target or the pattern are utf8, or has the issue where
4237                * the fold lengths may differ. */
4238                 const char * const l = locinput;
4239                 char *e = PL_regeol;
4240
4241                 if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
4242                                         l, &e, 0,  utf8_target, fold_utf8_flags))
4243                 {
4244                     sayNO;
4245                 }
4246                 locinput = e;
4247                 break;
4248             }
4249
4250             /* Neither the target nor the pattern are utf8 */
4251             if (UCHARAT(s) != nextchr
4252                 && !NEXTCHR_IS_EOS
4253                 && UCHARAT(s) != fold_array[nextchr])
4254             {
4255                 sayNO;
4256             }
4257             if (PL_regeol - locinput < ln)
4258                 sayNO;
4259             if (ln > 1 && ! folder(s, locinput, ln))
4260                 sayNO;
4261             locinput += ln;
4262             break;
4263         }
4264
4265         /* XXX Could improve efficiency by separating these all out using a
4266          * macro or in-line function.  At that point regcomp.c would no longer
4267          * have to set the FLAGS fields of these */
4268         case BOUNDL:  /*  /\b/l  */
4269         case NBOUNDL: /*  /\B/l  */
4270             PL_reg_flags |= RF_tainted;
4271             /* FALL THROUGH */
4272         case BOUND:   /*  /\b/   */
4273         case BOUNDU:  /*  /\b/u  */
4274         case BOUNDA:  /*  /\b/a  */
4275         case NBOUND:  /*  /\B/   */
4276         case NBOUNDU: /*  /\B/u  */
4277         case NBOUNDA: /*  /\B/a  */
4278             /* was last char in word? */
4279             if (utf8_target
4280                 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4281                 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4282             {
4283                 if (locinput == PL_bostr)
4284                     ln = '\n';
4285                 else {
4286                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4287
4288                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4289                 }
4290                 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4291                     ln = isALNUM_uni(ln);
4292                     if (NEXTCHR_IS_EOS)
4293                         n = 0;
4294                     else {
4295                         LOAD_UTF8_CHARCLASS_ALNUM();
4296                         n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4297                                                                 utf8_target);
4298                     }
4299                 }
4300                 else {
4301                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4302                     n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4303                 }
4304             }
4305             else {
4306
4307                 /* Here the string isn't utf8, or is utf8 and only ascii
4308                  * characters are to match \w.  In the latter case looking at
4309                  * the byte just prior to the current one may be just the final
4310                  * byte of a multi-byte character.  This is ok.  There are two
4311                  * cases:
4312                  * 1) it is a single byte character, and then the test is doing
4313                  *      just what it's supposed to.
4314                  * 2) it is a multi-byte character, in which case the final
4315                  *      byte is never mistakable for ASCII, and so the test
4316                  *      will say it is not a word character, which is the
4317                  *      correct answer. */
4318                 ln = (locinput != PL_bostr) ?
4319                     UCHARAT(locinput - 1) : '\n';
4320                 switch (FLAGS(scan)) {
4321                     case REGEX_UNICODE_CHARSET:
4322                         ln = isWORDCHAR_L1(ln);
4323                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4324                         break;
4325                     case REGEX_LOCALE_CHARSET:
4326                         ln = isALNUM_LC(ln);
4327                         n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4328                         break;
4329                     case REGEX_DEPENDS_CHARSET:
4330                         ln = isALNUM(ln);
4331                         n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4332                         break;
4333                     case REGEX_ASCII_RESTRICTED_CHARSET:
4334                     case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4335                         ln = isWORDCHAR_A(ln);
4336                         n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4337                         break;
4338                     default:
4339                         Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4340                         break;
4341                 }
4342             }
4343             /* Note requires that all BOUNDs be lower than all NBOUNDs in
4344              * regcomp.sym */
4345             if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4346                     sayNO;
4347             break;
4348
4349         case ANYOF:  /*  /[abc]/       */
4350             if (NEXTCHR_IS_EOS)
4351                 sayNO;
4352             if (utf8_target) {
4353                 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4354                     sayNO;
4355                 locinput += UTF8SKIP(locinput);
4356                 break;
4357             }
4358             else {
4359                 if (!REGINCLASS(rex, scan, (U8*)locinput))
4360                     sayNO;
4361                 locinput++;
4362                 break;
4363             }
4364             break;
4365
4366         /* Special char classes: \d, \w etc.
4367          * The defines start on line 166 or so */
4368         CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
4369                   ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4370                   ALNUMU, NALNUMU, isWORDCHAR_L1,
4371                   ALNUMA, NALNUMA, isWORDCHAR_A,
4372                   alnum, "a");
4373
4374         case SPACEL:
4375             PL_reg_flags |= RF_tainted;
4376             if (NEXTCHR_IS_EOS) {
4377                 sayNO;
4378             }
4379             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4380                 if (! isSPACE_LC_utf8((U8 *) locinput)) {
4381                     sayNO;
4382                 }
4383             }
4384             else if (! isSPACE_LC((U8) nextchr)) {
4385                     sayNO;
4386             }
4387             goto increment_locinput;
4388
4389         case NSPACEL:
4390             PL_reg_flags |= RF_tainted;
4391             if (NEXTCHR_IS_EOS) {
4392                 sayNO;
4393             }
4394             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4395                 if (isSPACE_LC_utf8((U8 *) locinput)) {
4396                     sayNO;
4397                 }
4398             }
4399             else if (isSPACE_LC(nextchr)) {
4400                     sayNO;
4401             }
4402             goto increment_locinput;
4403
4404         case SPACE:
4405             if (utf8_target) {
4406                 goto utf8_space;
4407             }
4408             /* FALL THROUGH */
4409         case SPACEA:
4410             if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
4411                 sayNO;
4412             }
4413             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4414             locinput++;
4415             break;
4416
4417         case NSPACE:
4418             if (utf8_target) {
4419                 goto utf8_nspace;
4420             }
4421             /* FALL THROUGH */
4422         case NSPACEA:
4423             if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
4424                 sayNO;
4425             }
4426             goto increment_locinput;
4427
4428         case SPACEU:
4429           utf8_space:
4430             if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
4431                 sayNO;
4432             }
4433             goto increment_locinput;
4434
4435         case NSPACEU:
4436           utf8_nspace:
4437             if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
4438                 sayNO;
4439             }
4440             goto increment_locinput;
4441
4442         CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
4443                 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4444                 DIGITA, NDIGITA, isDIGIT_A,
4445                 digit, "0");
4446
4447         case POSIXA: /* /[[:ascii:]]/ etc */
4448             if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4449                 sayNO;
4450             }
4451             /* Matched a utf8-invariant, so don't have to worry about utf8 */
4452             locinput++;
4453             break;
4454
4455         case NPOSIXA: /*  /[^[:ascii:]]/  etc */
4456             if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4457                 sayNO;
4458             }
4459             goto increment_locinput;
4460
4461         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4462                        a Unicode extended Grapheme Cluster */
4463             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4464               extended Grapheme Cluster is:
4465
4466                CR LF
4467                | Prepend* Begin Extend*
4468                | .
4469
4470                Begin is:           ( Special_Begin | ! Control )
4471                Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4472                Extend is:          ( Grapheme_Extend | Spacing_Mark )
4473                Control is:         [ GCB_Control | CR | LF ]
4474                Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4475
4476                If we create a 'Regular_Begin' = Begin - Special_Begin, then
4477                we can rewrite
4478
4479                    Begin is ( Regular_Begin + Special Begin )
4480
4481                It turns out that 98.4% of all Unicode code points match
4482                Regular_Begin.  Doing it this way eliminates a table match in
4483                the previous implementation for almost all Unicode code points.
4484
4485                There is a subtlety with Prepend* which showed up in testing.
4486                Note that the Begin, and only the Begin is required in:
4487                 | Prepend* Begin Extend*
4488                Also, Begin contains '! Control'.  A Prepend must be a
4489                '!  Control', which means it must also be a Begin.  What it
4490                comes down to is that if we match Prepend* and then find no
4491                suitable Begin afterwards, that if we backtrack the last
4492                Prepend, that one will be a suitable Begin.
4493             */
4494
4495             if (NEXTCHR_IS_EOS)
4496                 sayNO;
4497             if  (! utf8_target) {
4498
4499                 /* Match either CR LF  or '.', as all the other possibilities
4500                  * require utf8 */
4501                 locinput++;         /* Match the . or CR */
4502                 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4503                                        match the LF */
4504                     && locinput < PL_regeol
4505                     && UCHARAT(locinput) == '\n')
4506                 {
4507                     locinput++;
4508                 }
4509             }
4510             else {
4511
4512                 /* Utf8: See if is ( CR LF ); already know that locinput <
4513                  * PL_regeol, so locinput+1 is in bounds */
4514                 if ( nextchr == '\r' && locinput+1 < PL_regeol
4515                      && UCHARAT(locinput + 1) == '\n')
4516                 {
4517                     locinput += 2;
4518                 }
4519                 else {
4520                     STRLEN len;
4521
4522                     /* In case have to backtrack to beginning, then match '.' */
4523                     char *starting = locinput;
4524
4525                     /* In case have to backtrack the last prepend */
4526                     char *previous_prepend = NULL;
4527
4528                     LOAD_UTF8_CHARCLASS_GCB();
4529
4530                     /* Match (prepend)*   */
4531                     while (locinput < PL_regeol
4532                            && (len = is_GCB_Prepend_utf8(locinput)))
4533                     {
4534                         previous_prepend = locinput;
4535                         locinput += len;
4536                     }
4537
4538                     /* As noted above, if we matched a prepend character, but
4539                      * the next thing won't match, back off the last prepend we
4540                      * matched, as it is guaranteed to match the begin */
4541                     if (previous_prepend
4542                         && (locinput >=  PL_regeol
4543                             || (! swash_fetch(PL_utf8_X_regular_begin,
4544                                              (U8*)locinput, utf8_target)
4545                                  && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4546                         )
4547                     {
4548                         locinput = previous_prepend;
4549                     }
4550
4551                     /* Note that here we know PL_regeol > locinput, as we
4552                      * tested that upon input to this switch case, and if we
4553                      * moved locinput forward, we tested the result just above
4554                      * and it either passed, or we backed off so that it will
4555                      * now pass */
4556                     if (swash_fetch(PL_utf8_X_regular_begin,
4557                                     (U8*)locinput, utf8_target)) {
4558                         locinput += UTF8SKIP(locinput);
4559                     }
4560                     else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4561
4562                         /* Here did not match the required 'Begin' in the
4563                          * second term.  So just match the very first
4564                          * character, the '.' of the final term of the regex */
4565                         locinput = starting + UTF8SKIP(starting);
4566                         goto exit_utf8;
4567                     } else {
4568
4569                         /* Here is a special begin.  It can be composed of
4570                          * several individual characters.  One possibility is
4571                          * RI+ */
4572                         if ((len = is_GCB_RI_utf8(locinput))) {
4573                             locinput += len;
4574                             while (locinput < PL_regeol
4575                                    && (len = is_GCB_RI_utf8(locinput)))
4576                             {
4577                                 locinput += len;
4578                             }
4579                         } else if ((len = is_GCB_T_utf8(locinput))) {
4580                             /* Another possibility is T+ */
4581                             locinput += len;
4582                             while (locinput < PL_regeol
4583                                 && (len = is_GCB_T_utf8(locinput)))
4584                             {
4585                                 locinput += len;
4586                             }
4587                         } else {
4588
4589                             /* Here, neither RI+ nor T+; must be some other
4590                              * Hangul.  That means it is one of the others: L,
4591                              * LV, LVT or V, and matches:
4592                              * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4593
4594                             /* Match L*           */
4595                             while (locinput < PL_regeol
4596                                    && (len = is_GCB_L_utf8(locinput)))
4597                             {
4598                                 locinput += len;
4599                             }
4600
4601                             /* Here, have exhausted L*.  If the next character
4602                              * is not an LV, LVT nor V, it means we had to have
4603                              * at least one L, so matches L+ in the original
4604                              * equation, we have a complete hangul syllable.
4605                              * Are done. */
4606
4607                             if (locinput < PL_regeol
4608                                 && is_GCB_LV_LVT_V_utf8(locinput))
4609                             {
4610                                 /* Otherwise keep going.  Must be LV, LVT or V.
4611                                  * See if LVT, by first ruling out V, then LV */
4612                                 if (! is_GCB_V_utf8(locinput)
4613                                         /* All but every TCount one is LV */
4614                                     && (valid_utf8_to_uvchr((U8 *) locinput,
4615                                                                          NULL)
4616                                                                         - SBASE)
4617                                         % TCount != 0)
4618                                 {
4619                                     locinput += UTF8SKIP(locinput);
4620                                 } else {
4621
4622                                     /* Must be  V or LV.  Take it, then match
4623                                      * V*     */
4624                                     locinput += UTF8SKIP(locinput);
4625                                     while (locinput < PL_regeol
4626                                            && (len = is_GCB_V_utf8(locinput)))
4627                                     {
4628                                         locinput += len;
4629                                     }
4630                                 }
4631
4632                                 /* And any of LV, LVT, or V can be followed
4633                                  * by T*            */
4634                                 while (locinput < PL_regeol
4635                                        && (len = is_GCB_T_utf8(locinput)))
4636                                 {
4637                                     locinput += len;
4638                                 }
4639                             }
4640                         }
4641                     }
4642
4643                     /* Match any extender */
4644                     while (locinput < PL_regeol
4645                             && swash_fetch(PL_utf8_X_extend,
4646                                             (U8*)locinput, utf8_target))
4647                     {
4648                         locinput += UTF8SKIP(locinput);
4649                     }
4650                 }
4651             exit_utf8:
4652                 if (locinput > PL_regeol) sayNO;
4653             }
4654             break;
4655
4656         case NREFFL:  /*  /\g{name}/il  */
4657         {   /* The capture buffer cases.  The ones beginning with N for the
4658                named buffers just convert to the equivalent numbered and
4659                pretend they were called as the corresponding numbered buffer
4660                op.  */
4661             /* don't initialize these in the declaration, it makes C++
4662                unhappy */
4663             char *s;
4664             char type;
4665             re_fold_t folder;
4666             const U8 *fold_array;
4667             UV utf8_fold_flags;
4668
4669             PL_reg_flags |= RF_tainted;
4670             folder = foldEQ_locale;
4671             fold_array = PL_fold_locale;
4672             type = REFFL;
4673             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4674             goto do_nref;
4675
4676         case NREFFA:  /*  /\g{name}/iaa  */
4677             folder = foldEQ_latin1;
4678             fold_array = PL_fold_latin1;
4679             type = REFFA;
4680             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4681             goto do_nref;
4682
4683         case NREFFU:  /*  /\g{name}/iu  */
4684             folder = foldEQ_latin1;
4685             fold_array = PL_fold_latin1;
4686             type = REFFU;
4687             utf8_fold_flags = 0;
4688             goto do_nref;
4689
4690         case NREFF:  /*  /\g{name}/i  */
4691             folder = foldEQ;
4692             fold_array = PL_fold;
4693             type = REFF;
4694             utf8_fold_flags = 0;
4695             goto do_nref;
4696
4697         case NREF:  /*  /\g{name}/   */
4698             type = REF;
4699             folder = NULL;
4700             fold_array = NULL;
4701             utf8_fold_flags = 0;
4702           do_nref:
4703
4704             /* For the named back references, find the corresponding buffer
4705              * number */
4706             n = reg_check_named_buff_matched(rex,scan);
4707
4708             if ( ! n ) {
4709                 sayNO;
4710             }
4711             goto do_nref_ref_common;
4712
4713         case REFFL:  /*  /\1/il  */
4714             PL_reg_flags |= RF_tainted;
4715             folder = foldEQ_locale;
4716             fold_array = PL_fold_locale;
4717             utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4718             goto do_ref;
4719
4720         case REFFA:  /*  /\1/iaa  */
4721             folder = foldEQ_latin1;
4722             fold_array = PL_fold_latin1;
4723             utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4724             goto do_ref;
4725
4726         case REFFU:  /*  /\1/iu  */
4727             folder = foldEQ_latin1;
4728             fold_array = PL_fold_latin1;
4729             utf8_fold_flags = 0;
4730             goto do_ref;
4731
4732         case REFF:  /*  /\1/i  */
4733             folder = foldEQ;
4734             fold_array = PL_fold;
4735             utf8_fold_flags = 0;
4736             goto do_ref;
4737
4738         case REF:  /*  /\1/    */
4739             folder = NULL;
4740             fold_array = NULL;
4741             utf8_fold_flags = 0;
4742
4743           do_ref:
4744             type = OP(scan);
4745             n = ARG(scan);  /* which paren pair */
4746
4747           do_nref_ref_common:
4748             ln = rex->offs[n].start;
4749             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4750             if (rex->lastparen < n || ln == -1)
4751                 sayNO;                  /* Do not match unless seen CLOSEn. */
4752             if (ln == rex->offs[n].end)
4753                 break;
4754
4755             s = PL_bostr + ln;
4756             if (type != REF     /* REF can do byte comparison */
4757                 && (utf8_target || type == REFFU))
4758             { /* XXX handle REFFL better */
4759                 char * limit = PL_regeol;
4760
4761                 /* This call case insensitively compares the entire buffer
4762                     * at s, with the current input starting at locinput, but
4763                     * not going off the end given by PL_regeol, and returns in
4764                     * <limit> upon success, how much of the current input was
4765                     * matched */
4766                 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4767                                     locinput, &limit, 0, utf8_target, utf8_fold_flags))
4768                 {
4769                     sayNO;
4770                 }
4771                 locinput = limit;
4772                 break;
4773             }
4774
4775             /* Not utf8:  Inline the first character, for speed. */
4776             if (!NEXTCHR_IS_EOS &&
4777                 UCHARAT(s) != nextchr &&
4778                 (type == REF ||
4779                  UCHARAT(s) != fold_array[nextchr]))
4780                 sayNO;
4781             ln = rex->offs[n].end - ln;
4782             if (locinput + ln > PL_regeol)
4783                 sayNO;
4784             if (ln > 1 && (type == REF
4785                            ? memNE(s, locinput, ln)
4786                            : ! folder(s, locinput, ln)))
4787                 sayNO;
4788             locinput += ln;
4789             break;
4790         }
4791
4792         case NOTHING: /* null op; e.g. the 'nothing' following
4793                        * the '*' in m{(a+|b)*}' */
4794             break;
4795         case TAIL: /* placeholder while compiling (A|B|C) */
4796             break;
4797
4798         case BACK: /* ??? doesn't appear to be used ??? */
4799             break;
4800
4801 #undef  ST
4802 #define ST st->u.eval
4803         {
4804             SV *ret;
4805             REGEXP *re_sv;
4806             regexp *re;
4807             regexp_internal *rei;
4808             regnode *startpoint;
4809
4810         case GOSTART: /*  (?R)  */
4811         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4812             if (cur_eval && cur_eval->locinput==locinput) {
4813                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4814                     Perl_croak(aTHX_ "Infinite recursion in regex");
4815                 if ( ++nochange_depth > max_nochange_depth )
4816                     Perl_croak(aTHX_
4817                         "Pattern subroutine nesting without pos change"
4818                         " exceeded limit in regex");
4819             } else {
4820                 nochange_depth = 0;
4821             }
4822             re_sv = rex_sv;
4823             re = rex;
4824             rei = rexi;
4825             if (OP(scan)==GOSUB) {
4826                 startpoint = scan + ARG2L(scan);
4827                 ST.close_paren = ARG(scan);
4828             } else {
4829                 startpoint = rei->program+1;
4830                 ST.close_paren = 0;
4831             }
4832             goto eval_recurse_doit;
4833             assert(0); /* NOTREACHED */
4834
4835         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4836             if (cur_eval && cur_eval->locinput==locinput) {
4837                 if ( ++nochange_depth > max_nochange_depth )
4838                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4839             } else {
4840                 nochange_depth = 0;
4841             }
4842             {
4843                 /* execute the code in the {...} */
4844
4845                 dSP;
4846                 IV before;
4847                 OP * const oop = PL_op;
4848                 COP * const ocurcop = PL_curcop;
4849                 OP *nop;
4850                 char *saved_regeol = PL_regeol;
4851                 struct re_save_state saved_state;
4852                 CV *newcv;
4853
4854                 /* save *all* paren positions */
4855                 regcppush(rex, 0, maxopenparen);
4856                 REGCP_SET(runops_cp);
4857
4858                 /* To not corrupt the existing regex state while executing the
4859                  * eval we would normally put it on the save stack, like with
4860                  * save_re_context. However, re-evals have a weird scoping so we
4861                  * can't just add ENTER/LEAVE here. With that, things like
4862                  *
4863                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4864                  *
4865                  * would break, as they expect the localisation to be unwound
4866                  * only when the re-engine backtracks through the bit that
4867                  * localised it.
4868                  *
4869                  * What we do instead is just saving the state in a local c
4870                  * variable.
4871                  */
4872                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4873
4874                 PL_reg_state.re_reparsing = FALSE;
4875
4876                 if (!caller_cv)
4877                     caller_cv = find_runcv(NULL);
4878
4879                 n = ARG(scan);
4880
4881                 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4882                     newcv = (ReANY(
4883                                                 (REGEXP*)(rexi->data->data[n])
4884                                             ))->qr_anoncv
4885                                         ;
4886                     nop = (OP*)rexi->data->data[n+1];
4887                 }
4888                 else if (rexi->data->what[n] == 'l') { /* literal code */
4889                     newcv = caller_cv;
4890                     nop = (OP*)rexi->data->data[n];
4891                     assert(CvDEPTH(newcv));
4892                 }
4893                 else {
4894                     /* literal with own CV */
4895                     assert(rexi->data->what[n] == 'L');
4896                     newcv = rex->qr_anoncv;
4897                     nop = (OP*)rexi->data->data[n];
4898                 }
4899
4900                 /* normally if we're about to execute code from the same
4901                  * CV that we used previously, we just use the existing
4902                  * CX stack entry. However, its possible that in the
4903                  * meantime we may have backtracked, popped from the save
4904                  * stack, and undone the SAVECOMPPAD(s) associated with
4905                  * PUSH_MULTICALL; in which case PL_comppad no longer
4906                  * points to newcv's pad. */
4907                 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4908                 {
4909                     I32 depth = (newcv == caller_cv) ? 0 : 1;
4910                     if (last_pushed_cv) {
4911                         CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4912                     }
4913                     else {
4914                         PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4915                     }
4916                     last_pushed_cv = newcv;
4917                 }
4918                 else {
4919                     /* these assignments are just to silence compiler
4920                      * warnings */
4921                     multicall_cop = NULL;
4922                     newsp = NULL;
4923                 }
4924                 last_pad = PL_comppad;
4925
4926                 /* the initial nextstate you would normally execute
4927                  * at the start of an eval (which would cause error
4928                  * messages to come from the eval), may be optimised
4929                  * away from the execution path in the regex code blocks;
4930                  * so manually set PL_curcop to it initially */
4931                 {
4932                     OP *o = cUNOPx(nop)->op_first;
4933                     assert(o->op_type == OP_NULL);
4934                     if (o->op_targ == OP_SCOPE) {
4935                         o = cUNOPo->op_first;
4936                     }
4937                     else {
4938                         assert(o->op_targ == OP_LEAVE);
4939                         o = cUNOPo->op_first;
4940                         assert(o->op_type == OP_ENTER);
4941                         o = o->op_sibling;
4942                     }
4943
4944                     if (o->op_type != OP_STUB) {
4945                         assert(    o->op_type == OP_NEXTSTATE
4946                                 || o->op_type == OP_DBSTATE
4947                                 || (o->op_type == OP_NULL
4948                                     &&  (  o->op_targ == OP_NEXTSTATE
4949                                         || o->op_targ == OP_DBSTATE
4950                                         )
4951                                     )
4952                         );
4953                         PL_curcop = (COP*)o;
4954                     }
4955                 }
4956                 nop = nop->op_next;
4957
4958                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4959                     "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4960
4961                 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4962
4963                 if (sv_yes_mark) {
4964                     SV *sv_mrk = get_sv("REGMARK", 1);
4965                     sv_setsv(sv_mrk, sv_yes_mark);
4966                 }
4967
4968                 /* we don't use MULTICALL here as we want to call the
4969                  * first op of the block of interest, rather than the
4970                  * first op of the sub */
4971                 before = (IV)(SP-PL_stack_base);
4972                 PL_op = nop;
4973                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4974                 SPAGAIN;
4975                 if ((IV)(SP-PL_stack_base) == before)
4976                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4977                 else {
4978                     ret = POPs;
4979                     PUTBACK;
4980                 }
4981
4982                 /* before restoring everything, evaluate the returned
4983                  * value, so that 'uninit' warnings don't use the wrong
4984                  * PL_op or pad. Also need to process any magic vars
4985                  * (e.g. $1) *before* parentheses are restored */
4986
4987                 PL_op = NULL;
4988
4989                 re_sv = NULL;
4990                 if (logical == 0)        /*   (?{})/   */
4991                     sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4992                 else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4993                     sw = cBOOL(SvTRUE(ret));
4994                     logical = 0;
4995                 }
4996                 else {                   /*  /(??{})  */
4997                     /*  if its overloaded, let the regex compiler handle
4998                      *  it; otherwise extract regex, or stringify  */
4999                     if (!SvAMAGIC(ret)) {
5000                         SV *sv = ret;
5001                         if (SvROK(sv))
5002                             sv = SvRV(sv);
5003                         if (SvTYPE(sv) == SVt_REGEXP)
5004                             re_sv = (REGEXP*) sv;
5005                         else if (SvSMAGICAL(sv)) {
5006                             MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5007                             if (mg)
5008                                 re_sv = (REGEXP *) mg->mg_obj;
5009                         }
5010
5011                         /* force any magic, undef warnings here */
5012                         if (!re_sv) {
5013                             ret = sv_mortalcopy(ret);
5014                             (void) SvPV_force_nolen(ret);
5015                         }
5016                     }
5017
5018                 }
5019
5020                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5021
5022                 /* *** Note that at this point we don't restore
5023                  * PL_comppad, (or pop the CxSUB) on the assumption it may
5024                  * be used again soon. This is safe as long as nothing
5025                  * in the regexp code uses the pad ! */
5026                 PL_op = oop;
5027                 PL_curcop = ocurcop;
5028                 PL_regeol = saved_regeol;
5029                 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5030
5031                 if (logical != 2)
5032                     break;
5033             }
5034
5035                 /* only /(??{})/  from now on */
5036                 logical = 0;
5037                 {
5038                     /* extract RE object from returned value; compiling if
5039                      * necessary */
5040
5041                     if (re_sv) {
5042                         re_sv = reg_temp_copy(NULL, re_sv);
5043                     }
5044                     else {
5045                         U32 pm_flags = 0;
5046
5047                         if (SvUTF8(ret) && IN_BYTES) {
5048                             /* In use 'bytes': make a copy of the octet
5049                              * sequence, but without the flag on */
5050                             STRLEN len;
5051                             const char *const p = SvPV(ret, len);
5052                             ret = newSVpvn_flags(p, len, SVs_TEMP);
5053                         }
5054                         if (rex->intflags & PREGf_USE_RE_EVAL)
5055                             pm_flags |= PMf_USE_RE_EVAL;
5056
5057                         /* if we got here, it should be an engine which
5058                          * supports compiling code blocks and stuff */
5059                         assert(rex->engine && rex->engine->op_comp);
5060                         assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5061                         re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5062                                     rex->engine, NULL, NULL,
5063                                     /* copy /msix etc to inner pattern */
5064                                     scan->flags,
5065                                     pm_flags);
5066
5067                         if (!(SvFLAGS(ret)
5068                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5069                                  | SVs_GMG))) {
5070                             /* This isn't a first class regexp. Instead, it's
5071                                caching a regexp onto an existing, Perl visible
5072                                scalar.  */
5073                             sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5074                         }
5075                         /* safe to do now that any $1 etc has been
5076                          * interpolated into the new pattern string and
5077                          * compiled */
5078                         S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5079                     }
5080                     SAVEFREESV(re_sv);
5081                     re = ReANY(re_sv);
5082                 }
5083                 RXp_MATCH_COPIED_off(re);
5084                 re->subbeg = rex->subbeg;
5085                 re->sublen = rex->sublen;
5086                 re->suboffset = rex->suboffset;
5087                 re->subcoffset = rex->subcoffset;
5088                 rei = RXi_GET(re);
5089                 DEBUG_EXECUTE_r(
5090                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
5091                         "Matching embedded");
5092                 );
5093                 startpoint = rei->program + 1;
5094                 ST.close_paren = 0; /* only used for GOSUB */
5095
5096         eval_recurse_doit: /* Share code with GOSUB below this line */
5097                 /* run the pattern returned from (??{...}) */
5098
5099                 /* Save *all* the positions. */
5100                 ST.cp = regcppush(rex, 0, maxopenparen);
5101                 REGCP_SET(ST.lastcp);
5102
5103                 re->lastparen = 0;
5104                 re->lastcloseparen = 0;
5105
5106                 maxopenparen = 0;
5107
5108                 /* XXXX This is too dramatic a measure... */
5109                 PL_reg_maxiter = 0;
5110
5111                 ST.toggle_reg_flags = PL_reg_flags;
5112                 if (RX_UTF8(re_sv))
5113                     PL_reg_flags |= RF_utf8;
5114                 else
5115                     PL_reg_flags &= ~RF_utf8;
5116                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5117
5118                 ST.prev_rex = rex_sv;
5119                 ST.prev_curlyx = cur_curlyx;
5120                 rex_sv = re_sv;
5121                 SET_reg_curpm(rex_sv);
5122                 rex = re;
5123                 rexi = rei;
5124                 cur_curlyx = NULL;
5125                 ST.B = next;
5126                 ST.prev_eval = cur_eval;
5127                 cur_eval = st;
5128                 /* now continue from first node in postoned RE */
5129                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5130                 assert(0); /* NOTREACHED */
5131         }
5132
5133         case EVAL_AB: /* cleanup after a successful (??{A})B */
5134             /* note: this is called twice; first after popping B, then A */
5135             PL_reg_flags ^= ST.toggle_reg_flags;
5136             rex_sv = ST.prev_rex;
5137             SET_reg_curpm(rex_sv);
5138             rex = ReANY(rex_sv);
5139             rexi = RXi_GET(rex);
5140             regcpblow(ST.cp);
5141             cur_eval = ST.prev_eval;
5142             cur_curlyx = ST.prev_curlyx;
5143
5144             /* XXXX This is too dramatic a measure... */
5145             PL_reg_maxiter = 0;
5146             if ( nochange_depth )
5147                 nochange_depth--;
5148             sayYES;
5149
5150
5151         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5152             /* note: this is called twice; first after popping B, then A */
5153             PL_reg_flags ^= ST.toggle_reg_flags;
5154             rex_sv = ST.prev_rex;
5155             SET_reg_curpm(rex_sv);
5156             rex = ReANY(rex_sv);
5157             rexi = RXi_GET(rex);
5158
5159             REGCP_UNWIND(ST.lastcp);
5160             regcppop(rex, &maxopenparen);
5161             cur_eval = ST.prev_eval;
5162             cur_curlyx = ST.prev_curlyx;
5163             /* XXXX This is too dramatic a measure... */
5164             PL_reg_maxiter = 0;
5165             if ( nochange_depth )
5166                 nochange_depth--;
5167             sayNO_SILENT;
5168 #undef ST
5169
5170         case OPEN: /*  (  */
5171             n = ARG(scan);  /* which paren pair */
5172             rex->offs[n].start_tmp = locinput - PL_bostr;
5173             if (n > maxopenparen)
5174                 maxopenparen = n;
5175             DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5176                 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5177                 PTR2UV(rex),
5178                 PTR2UV(rex->offs),
5179                 (UV)n,
5180                 (IV)rex->offs[n].start_tmp,
5181                 (UV)maxopenparen
5182             ));
5183             lastopen = n;
5184             break;
5185
5186 /* XXX really need to log other places start/end are set too */
5187 #define CLOSE_CAPTURE \
5188     rex->offs[n].start = rex->offs[n].start_tmp; \
5189     rex->offs[n].end = locinput - PL_bostr; \
5190     DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5191         "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5192         PTR2UV(rex), \
5193         PTR2UV(rex->offs), \
5194         (UV)n, \
5195         (IV)rex->offs[n].start, \
5196         (IV)rex->offs[n].end \
5197     ))
5198
5199         case CLOSE:  /*  )  */
5200             n = ARG(scan);  /* which paren pair */
5201             CLOSE_CAPTURE;
5202             if (n > rex->lastparen)
5203                 rex->lastparen = n;
5204             rex->lastcloseparen = n;
5205             if (cur_eval && cur_eval->u.eval.close_paren == n) {
5206                 goto fake_end;
5207             }
5208             break;
5209
5210         case ACCEPT:  /*  (*ACCEPT)  */
5211             if (ARG(scan)){
5212                 regnode *cursor;
5213                 for (cursor=scan;
5214                      cursor && OP(cursor)!=END;
5215                      cursor=regnext(cursor))
5216                 {
5217                     if ( OP(cursor)==CLOSE ){
5218                         n = ARG(cursor);
5219                         if ( n <= lastopen ) {
5220                             CLOSE_CAPTURE;
5221                             if (n > rex->lastparen)
5222                                 rex->lastparen = n;
5223                             rex->lastcloseparen = n;
5224                             if ( n == ARG(scan) || (cur_eval &&
5225                                 cur_eval->u.eval.close_paren == n))
5226                                 break;
5227                         }
5228                     }
5229                 }
5230             }
5231             goto fake_end;
5232             /*NOTREACHED*/
5233
5234         case GROUPP:  /*  (?(1))  */
5235             n = ARG(scan);  /* which paren pair */
5236             sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5237             break;
5238
5239         case NGROUPP:  /*  (?(<name>))  */
5240             /* reg_check_named_buff_matched returns 0 for no match */
5241             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5242             break;
5243
5244         case INSUBP:   /*  (?(R))  */
5245             n = ARG(scan);
5246             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5247             break;
5248
5249         case DEFINEP:  /*  (?(DEFINE))  */
5250             sw = 0;
5251             break;
5252
5253         case IFTHEN:   /*  (?(cond)A|B)  */
5254             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
5255             if (sw)
5256                 next = NEXTOPER(NEXTOPER(scan));
5257             else {
5258                 next = scan + ARG(scan);
5259                 if (OP(next) == IFTHEN) /* Fake one. */
5260                     next = NEXTOPER(NEXTOPER(next));
5261             }
5262             break;
5263
5264         case LOGICAL:  /* modifier for EVAL and IFMATCH */
5265             logical = scan->flags;
5266             break;
5267
5268 /*******************************************************************
5269
5270 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5271 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5272 STAR/PLUS/CURLY/CURLYN are used instead.)
5273
5274 A*B is compiled as <CURLYX><A><WHILEM><B>
5275
5276 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5277 state, which contains the current count, initialised to -1. It also sets
5278 cur_curlyx to point to this state, with any previous value saved in the
5279 state block.
5280
5281 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5282 since the pattern may possibly match zero times (i.e. it's a while {} loop
5283 rather than a do {} while loop).
5284
5285 Each entry to WHILEM represents a successful match of A. The count in the
5286 CURLYX block is incremented, another WHILEM state is pushed, and execution
5287 passes to A or B depending on greediness and the current count.
5288
5289 For example, if matching against the string a1a2a3b (where the aN are
5290 substrings that match /A/), then the match progresses as follows: (the
5291 pushed states are interspersed with the bits of strings matched so far):
5292
5293     <CURLYX cnt=-1>
5294     <CURLYX cnt=0><WHILEM>
5295     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5296     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5297     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5298     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5299
5300 (Contrast this with something like CURLYM, which maintains only a single
5301 backtrack state:
5302
5303     <CURLYM cnt=0> a1
5304     a1 <CURLYM cnt=1> a2
5305     a1 a2 <CURLYM cnt=2> a3
5306     a1 a2 a3 <CURLYM cnt=3> b
5307 )
5308
5309 Each WHILEM state block marks a point to backtrack to upon partial failure
5310 of A or B, and also contains some minor state data related to that
5311 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5312 overall state, such as the count, and pointers to the A and B ops.
5313
5314 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5315 must always point to the *current* CURLYX block, the rules are:
5316
5317 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5318 and set cur_curlyx to point the new block.
5319
5320 When popping the CURLYX block after a successful or unsuccessful match,
5321 restore the previous cur_curlyx.
5322
5323 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5324 to the outer one saved in the CURLYX block.
5325
5326 When popping the WHILEM block after a successful or unsuccessful B match,
5327 restore the previous cur_curlyx.
5328
5329 Here's an example for the pattern (AI* BI)*BO
5330 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5331
5332 cur_
5333 curlyx backtrack stack
5334 ------ ---------------
5335 NULL
5336 CO     <CO prev=NULL> <WO>
5337 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5338 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5339 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5340
5341 At this point the pattern succeeds, and we work back down the stack to
5342 clean up, restoring as we go:
5343
5344 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5345 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5346 CO     <CO prev=NULL> <WO>
5347 NULL
5348
5349 *******************************************************************/
5350
5351 #define ST st->u.curlyx
5352
5353         case CURLYX:    /* start of /A*B/  (for complex A) */
5354         {
5355             /* No need to save/restore up to this paren */
5356             I32 parenfloor = scan->flags;
5357
5358             assert(next); /* keep Coverity happy */
5359             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5360                 next += ARG(next);
5361
5362             /* XXXX Probably it is better to teach regpush to support
5363                parenfloor > maxopenparen ... */
5364             if (parenfloor > (I32)rex->lastparen)
5365                 parenfloor = rex->lastparen; /* Pessimization... */
5366
5367             ST.prev_curlyx= cur_curlyx;
5368             cur_curlyx = st;
5369             ST.cp = PL_savestack_ix;
5370
5371             /* these fields contain the state of the current curly.
5372              * they are accessed by subsequent WHILEMs */
5373             ST.parenfloor = parenfloor;
5374             ST.me = scan;
5375             ST.B = next;
5376             ST.minmod = minmod;
5377             minmod = 0;
5378             ST.count = -1;      /* this will be updated by WHILEM */
5379             ST.lastloc = NULL;  /* this will be updated by WHILEM */
5380
5381             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5382             assert(0); /* NOTREACHED */
5383         }
5384
5385         case CURLYX_end: /* just finished matching all of A*B */
5386             cur_curlyx = ST.prev_curlyx;
5387             sayYES;
5388             assert(0); /* NOTREACHED */
5389
5390         case CURLYX_end_fail: /* just failed to match all of A*B */
5391             regcpblow(ST.cp);
5392             cur_curlyx = ST.prev_curlyx;
5393             sayNO;
5394             assert(0); /* NOTREACHED */
5395
5396
5397 #undef ST
5398 #define ST st->u.whilem
5399
5400         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5401         {
5402             /* see the discussion above about CURLYX/WHILEM */
5403             I32 n;
5404             int min = ARG1(cur_curlyx->u.curlyx.me);
5405             int max = ARG2(cur_curlyx->u.curlyx.me);
5406             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5407
5408             assert(cur_curlyx); /* keep Coverity happy */
5409             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5410             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5411             ST.cache_offset = 0;
5412             ST.cache_mask = 0;
5413
5414
5415             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5416                   "%*s  whilem: matched %ld out of %d..%d\n",
5417                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5418             );
5419
5420             /* First just match a string of min A's. */
5421
5422             if (n < min) {
5423                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5424                                     maxopenparen);
5425                 cur_curlyx->u.curlyx.lastloc = locinput;
5426                 REGCP_SET(ST.lastcp);
5427
5428                 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5429                 assert(0); /* NOTREACHED */
5430             }
5431
5432             /* If degenerate A matches "", assume A done. */
5433
5434             if (locinput == cur_curlyx->u.curlyx.lastloc) {
5435                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5436                    "%*s  whilem: empty match detected, trying continuation...\n",
5437                    REPORT_CODE_OFF+depth*2, "")
5438                 );
5439                 goto do_whilem_B_max;
5440             }
5441
5442             /* super-linear cache processing */
5443
5444             if (scan->flags) {
5445
5446                 if (!PL_reg_maxiter) {
5447                     /* start the countdown: Postpone detection until we
5448                      * know the match is not *that* much linear. */
5449                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5450                     /* possible overflow for long strings and many CURLYX's */
5451                     if (PL_reg_maxiter < 0)
5452                         PL_reg_maxiter = I32_MAX;
5453                     PL_reg_leftiter = PL_reg_maxiter;
5454                 }
5455
5456                 if (PL_reg_leftiter-- == 0) {
5457                     /* initialise cache */
5458                     const I32 size = (PL_reg_maxiter + 7)/8;
5459                     if (PL_reg_poscache) {
5460                         if ((I32)PL_reg_poscache_size < size) {
5461                             Renew(PL_reg_poscache, size, char);
5462                             PL_reg_poscache_size = size;
5463                         }
5464                         Zero(PL_reg_poscache, size, char);
5465                     }
5466                     else {
5467                         PL_reg_poscache_size = size;
5468                         Newxz(PL_reg_poscache, size, char);
5469                     }
5470                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5471       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5472                               PL_colors[4], PL_colors[5])
5473                     );
5474                 }
5475
5476                 if (PL_reg_leftiter < 0) {
5477                     /* have we already failed at this position? */
5478                     I32 offset, mask;
5479                     offset  = (scan->flags & 0xf) - 1
5480                                 + (locinput - PL_bostr)  * (scan->flags>>4);
5481                     mask    = 1 << (offset % 8);
5482                     offset /= 8;
5483                     if (PL_reg_poscache[offset] & mask) {
5484                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5485                             "%*s  whilem: (cache) already tried at this position...\n",
5486                             REPORT_CODE_OFF+depth*2, "")
5487                         );
5488                         sayNO; /* cache records failure */
5489                     }
5490                     ST.cache_offset = offset;
5491                     ST.cache_mask   = mask;
5492                 }
5493             }
5494
5495             /* Prefer B over A for minimal matching. */
5496
5497             if (cur_curlyx->u.curlyx.minmod) {
5498                 ST.save_curlyx = cur_curlyx;
5499                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5500                 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5501                             maxopenparen);
5502                 REGCP_SET(ST.lastcp);
5503                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5504                                     locinput);
5505                 assert(0); /* NOTREACHED */
5506             }
5507
5508             /* Prefer A over B for maximal matching. */
5509
5510             if (n < max) { /* More greed allowed? */
5511                 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5512                             maxopenparen);
5513                 cur_curlyx->u.curlyx.lastloc = locinput;
5514                 REGCP_SET(ST.lastcp);
5515                 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5516                 assert(0); /* NOTREACHED */
5517             }
5518             goto do_whilem_B_max;
5519         }
5520         assert(0); /* NOTREACHED */
5521
5522         case WHILEM_B_min: /* just matched B in a minimal match */
5523         case WHILEM_B_max: /* just matched B in a maximal match */
5524             cur_curlyx = ST.save_curlyx;
5525             sayYES;
5526             assert(0); /* NOTREACHED */
5527
5528         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5529             cur_curlyx = ST.save_curlyx;
5530             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5531             cur_curlyx->u.curlyx.count--;
5532             CACHEsayNO;
5533             assert(0); /* NOTREACHED */
5534
5535         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5536             /* FALL THROUGH */
5537         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5538             REGCP_UNWIND(ST.lastcp);
5539             regcppop(rex, &maxopenparen);
5540             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5541             cur_curlyx->u.curlyx.count--;
5542             CACHEsayNO;
5543             assert(0); /* NOTREACHED */
5544
5545         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5546             REGCP_UNWIND(ST.lastcp);
5547             regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5548             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5549                 "%*s  whilem: failed, trying continuation...\n",
5550                 REPORT_CODE_OFF+depth*2, "")
5551             );
5552           do_whilem_B_max:
5553             if (cur_curlyx->u.curlyx.count >= REG_INFTY
5554                 && ckWARN(WARN_REGEXP)
5555                 && !(PL_reg_flags & RF_warned))
5556             {
5557                 PL_reg_flags |= RF_warned;
5558                 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5559                      "Complex regular subexpression recursion limit (%d) "
5560                      "exceeded",
5561                      REG_INFTY - 1);
5562             }
5563
5564             /* now try B */
5565             ST.save_curlyx = cur_curlyx;
5566             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5567             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5568                                 locinput);
5569             assert(0); /* NOTREACHED */
5570
5571         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5572             cur_curlyx = ST.save_curlyx;
5573             REGCP_UNWIND(ST.lastcp);
5574             regcppop(rex, &maxopenparen);
5575
5576             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5577                 /* Maximum greed exceeded */
5578                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5579                     && ckWARN(WARN_REGEXP)
5580                     && !(PL_reg_flags & RF_warned))
5581                 {
5582                     PL_reg_flags |= RF_warned;
5583                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5584                         "Complex regular subexpression recursion "
5585                         "limit (%d) exceeded",
5586                         REG_INFTY - 1);
5587                 }
5588                 cur_curlyx->u.curlyx.count--;
5589                 CACHEsayNO;
5590             }
5591
5592             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5593                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5594             );
5595             /* Try grabbing another A and see if it helps. */
5596             cur_curlyx->u.curlyx.lastloc = locinput;
5597             ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5598                             maxopenparen);
5599             REGCP_SET(ST.lastcp);
5600             PUSH_STATE_GOTO(WHILEM_A_min,
5601                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5602                 locinput);
5603             assert(0); /* NOTREACHED */
5604
5605 #undef  ST
5606 #define ST st->u.branch
5607
5608         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
5609             next = scan + ARG(scan);
5610             if (next == scan)
5611                 next = NULL;
5612             scan = NEXTOPER(scan);
5613             /* FALL THROUGH */
5614
5615         case BRANCH:        /*  /(...|A|...)/ */
5616             scan = NEXTOPER(scan); /* scan now points to inner node */
5617             ST.lastparen = rex->lastparen;
5618             ST.lastcloseparen = rex->lastcloseparen;
5619             ST.next_branch = next;
5620             REGCP_SET(ST.cp);
5621
5622             /* Now go into the branch */
5623             if (has_cutgroup) {
5624                 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5625             } else {
5626                 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5627             }
5628             assert(0); /* NOTREACHED */
5629
5630         case CUTGROUP:  /*  /(*THEN)/  */
5631             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5632                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5633             PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5634             assert(0); /* NOTREACHED */
5635
5636         case CUTGROUP_next_fail:
5637             do_cutgroup = 1;
5638             no_final = 1;
5639             if (st->u.mark.mark_name)
5640                 sv_commit = st->u.mark.mark_name;
5641             sayNO;
5642             assert(0); /* NOTREACHED */
5643
5644         case BRANCH_next:
5645             sayYES;
5646             assert(0); /* NOTREACHED */
5647
5648         case BRANCH_next_fail: /* that branch failed; try the next, if any */
5649             if (do_cutgroup) {
5650                 do_cutgroup = 0;
5651                 no_final = 0;
5652             }
5653             REGCP_UNWIND(ST.cp);
5654             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5655             scan = ST.next_branch;
5656             /* no more branches? */
5657             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5658                 DEBUG_EXECUTE_r({
5659                     PerlIO_printf( Perl_debug_log,
5660                         "%*s  %sBRANCH failed...%s\n",
5661                         REPORT_CODE_OFF+depth*2, "",
5662                         PL_colors[4],
5663                         PL_colors[5] );
5664                 });
5665                 sayNO_SILENT;
5666             }
5667             continue; /* execute next BRANCH[J] op */
5668             assert(0); /* NOTREACHED */
5669
5670         case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5671             minmod = 1;
5672             break;
5673
5674 #undef  ST
5675 #define ST st->u.curlym
5676
5677         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
5678
5679             /* This is an optimisation of CURLYX that enables us to push
5680              * only a single backtracking state, no matter how many matches
5681              * there are in {m,n}. It relies on the pattern being constant
5682              * length, with no parens to influence future backrefs
5683              */
5684
5685             ST.me = scan;
5686             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5687
5688             ST.lastparen      = rex->lastparen;
5689             ST.lastcloseparen = rex->lastcloseparen;
5690
5691             /* if paren positive, emulate an OPEN/CLOSE around A */
5692             if (ST.me->flags) {
5693                 U32 paren = ST.me->flags;
5694                 if (paren > maxopenparen)
5695                     maxopenparen = paren;
5696                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5697             }
5698             ST.A = scan;
5699             ST.B = next;
5700             ST.alen = 0;
5701             ST.count = 0;
5702             ST.minmod = minmod;
5703             minmod = 0;
5704             ST.c1 = CHRTEST_UNINIT;
5705             REGCP_SET(ST.cp);
5706
5707             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5708                 goto curlym_do_B;
5709
5710           curlym_do_A: /* execute the A in /A{m,n}B/  */
5711             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5712             assert(0); /* NOTREACHED */
5713
5714         case CURLYM_A: /* we've just matched an A */
5715             ST.count++;
5716             /* after first match, determine A's length: u.curlym.alen */
5717             if (ST.count == 1) {
5718                 if (PL_reg_match_utf8) {
5719                     char *s = st->locinput;
5720                     while (s < locinput) {
5721                         ST.alen++;
5722                         s += UTF8SKIP(s);
5723                     }
5724                 }
5725                 else {
5726                     ST.alen = locinput - st->locinput;
5727                 }
5728                 if (ST.alen == 0)
5729                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5730             }
5731             DEBUG_EXECUTE_r(
5732                 PerlIO_printf(Perl_debug_log,
5733                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5734                           (int)(REPORT_CODE_OFF+(depth*2)), "",
5735                           (IV) ST.count, (IV)ST.alen)
5736             );
5737
5738             if (cur_eval && cur_eval->u.eval.close_paren &&
5739                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5740                 goto fake_end;
5741
5742             {
5743                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5744                 if ( max == REG_INFTY || ST.count < max )
5745                     goto curlym_do_A; /* try to match another A */
5746             }
5747             goto curlym_do_B; /* try to match B */
5748
5749         case CURLYM_A_fail: /* just failed to match an A */
5750             REGCP_UNWIND(ST.cp);
5751
5752             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5753                 || (cur_eval && cur_eval->u.eval.close_paren &&
5754                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5755                 sayNO;
5756
5757           curlym_do_B: /* execute the B in /A{m,n}B/  */
5758             if (ST.c1 == CHRTEST_UNINIT) {
5759                 /* calculate c1 and c2 for possible match of 1st char
5760                  * following curly */
5761                 ST.c1 = ST.c2 = CHRTEST_VOID;
5762                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5763                     regnode *text_node = ST.B;
5764                     if (! HAS_TEXT(text_node))
5765                         FIND_NEXT_IMPT(text_node);
5766                     /* this used to be
5767
5768                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5769
5770                         But the former is redundant in light of the latter.
5771
5772                         if this changes back then the macro for
5773                         IS_TEXT and friends need to change.
5774                      */
5775                     if (PL_regkind[OP(text_node)] == EXACT) {
5776                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5777                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5778                         {
5779                             sayNO;
5780                         }
5781                     }
5782                 }
5783             }
5784
5785             DEBUG_EXECUTE_r(
5786                 PerlIO_printf(Perl_debug_log,
5787                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5788                     (int)(REPORT_CODE_OFF+(depth*2)),
5789                     "", (IV)ST.count)
5790                 );
5791             if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5792                 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5793                     if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5794                         && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5795                     {
5796                         /* simulate B failing */
5797                         DEBUG_OPTIMISE_r(
5798                             PerlIO_printf(Perl_debug_log,
5799                                 "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5800                                 (int)(REPORT_CODE_OFF+(depth*2)),"",
5801                                 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5802                                 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5803                                 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5804                         );
5805                         state_num = CURLYM_B_fail;
5806                         goto reenter_switch;
5807                     }
5808                 }
5809                 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5810                     /* simulate B failing */
5811                     DEBUG_OPTIMISE_r(
5812                         PerlIO_printf(Perl_debug_log,
5813                             "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5814                             (int)(REPORT_CODE_OFF+(depth*2)),"",
5815                             (int) nextchr, ST.c1, ST.c2)
5816                     );
5817                     state_num = CURLYM_B_fail;
5818                     goto reenter_switch;
5819                 }
5820             }
5821
5822             if (ST.me->flags) {
5823                 /* emulate CLOSE: mark current A as captured */
5824                 I32 paren = ST.me->flags;
5825                 if (ST.count) {
5826                     rex->offs[paren].start
5827                         = HOPc(locinput, -ST.alen) - PL_bostr;
5828                     rex->offs[paren].end = locinput - PL_bostr;
5829                     if ((U32)paren > rex->lastparen)
5830                         rex->lastparen = paren;
5831                     rex->lastcloseparen = paren;
5832                 }
5833                 else
5834                     rex->offs[paren].end = -1;
5835                 if (cur_eval && cur_eval->u.eval.close_paren &&
5836                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5837                 {
5838                     if (ST.count)
5839                         goto fake_end;
5840                     else
5841                         sayNO;
5842                 }
5843             }
5844
5845             PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5846             assert(0); /* NOTREACHED */
5847
5848         case CURLYM_B_fail: /* just failed to match a B */
5849             REGCP_UNWIND(ST.cp);
5850             UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5851             if (ST.minmod) {
5852                 I32 max = ARG2(ST.me);
5853                 if (max != REG_INFTY && ST.count == max)
5854                     sayNO;
5855                 goto curlym_do_A; /* try to match a further A */
5856             }
5857             /* backtrack one A */
5858             if (ST.count == ARG1(ST.me) /* min */)
5859                 sayNO;
5860             ST.count--;
5861             SET_locinput(HOPc(locinput, -ST.alen));
5862             goto curlym_do_B; /* try to match B */
5863
5864 #undef ST
5865 #define ST st->u.curly
5866
5867 #define CURLY_SETPAREN(paren, success) \
5868     if (paren) { \
5869         if (success) { \
5870             rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5871             rex->offs[paren].end = locinput - PL_bostr; \
5872             if (paren > rex->lastparen) \
5873                 rex->lastparen = paren; \
5874             rex->lastcloseparen = paren; \
5875         } \
5876         else { \
5877             rex->offs[paren].end = -1; \
5878             rex->lastparen      = ST.lastparen; \
5879             rex->lastcloseparen = ST.lastcloseparen; \
5880         } \
5881     }
5882
5883         case STAR:              /*  /A*B/ where A is width 1 char */
5884             ST.paren = 0;
5885             ST.min = 0;
5886             ST.max = REG_INFTY;
5887             scan = NEXTOPER(scan);
5888             goto repeat;
5889
5890         case PLUS:              /*  /A+B/ where A is width 1 char */
5891             ST.paren = 0;
5892             ST.min = 1;
5893             ST.max = REG_INFTY;
5894             scan = NEXTOPER(scan);
5895             goto repeat;
5896
5897         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 char */
5898             ST.paren = scan->flags;     /* Which paren to set */
5899             ST.lastparen      = rex->lastparen;
5900             ST.lastcloseparen = rex->lastcloseparen;
5901             if (ST.paren > maxopenparen)
5902                 maxopenparen = ST.paren;
5903             ST.min = ARG1(scan);  /* min to match */
5904             ST.max = ARG2(scan);  /* max to match */
5905             if (cur_eval && cur_eval->u.eval.close_paren &&
5906                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5907                 ST.min=1;
5908                 ST.max=1;
5909             }
5910             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5911             goto repeat;
5912
5913         case CURLY:             /*  /A{m,n}B/ where A is width 1 char */
5914             ST.paren = 0;
5915             ST.min = ARG1(scan);  /* min to match */
5916             ST.max = ARG2(scan);  /* max to match */
5917             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5918           repeat:
5919             /*
5920             * Lookahead to avoid useless match attempts
5921             * when we know what character comes next.
5922             *
5923             * Used to only do .*x and .*?x, but now it allows
5924             * for )'s, ('s and (?{ ... })'s to be in the way
5925             * of the quantifier and the EXACT-like node.  -- japhy
5926             */
5927
5928             assert(ST.min <= ST.max);
5929             if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5930                 ST.c1 = ST.c2 = CHRTEST_VOID;
5931             }
5932             else {
5933                 regnode *text_node = next;
5934
5935                 if (! HAS_TEXT(text_node))
5936                     FIND_NEXT_IMPT(text_node);
5937
5938                 if (! HAS_TEXT(text_node))
5939                     ST.c1 = ST.c2 = CHRTEST_VOID;
5940                 else {
5941                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5942                         ST.c1 = ST.c2 = CHRTEST_VOID;
5943                     }
5944                     else {
5945
5946                     /*  Currently we only get here when
5947
5948                         PL_rekind[OP(text_node)] == EXACT
5949
5950                         if this changes back then the macro for IS_TEXT and
5951                         friends need to change. */
5952                         if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5953                            text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5954                         {
5955                             sayNO;
5956                         }
5957                     }
5958                 }
5959             }
5960
5961             ST.A = scan;
5962             ST.B = next;
5963             if (minmod) {
5964                 char *li = locinput;
5965                 minmod = 0;
5966                 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5967                     sayNO;
5968                 SET_locinput(li);
5969                 ST.count = ST.min;
5970                 REGCP_SET(ST.cp);
5971                 if (ST.c1 == CHRTEST_VOID)
5972                     goto curly_try_B_min;
5973
5974                 ST.oldloc = locinput;
5975
5976                 /* set ST.maxpos to the furthest point along the
5977                  * string that could possibly match */
5978                 if  (ST.max == REG_INFTY) {
5979                     ST.maxpos = PL_regeol - 1;
5980                     if (utf8_target)
5981                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5982                             ST.maxpos--;
5983                 }
5984                 else if (utf8_target) {
5985                     int m = ST.max - ST.min;
5986                     for (ST.maxpos = locinput;
5987                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5988                         ST.maxpos += UTF8SKIP(ST.maxpos);
5989                 }
5990                 else {
5991                     ST.maxpos = locinput + ST.max - ST.min;
5992                     if (ST.maxpos >= PL_regeol)
5993                         ST.maxpos = PL_regeol - 1;
5994                 }
5995                 goto curly_try_B_min_known;
5996
5997             }
5998             else {
5999                 /* avoid taking address of locinput, so it can remain
6000                  * a register var */
6001                 char *li = locinput;
6002                 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
6003                 if (ST.count < ST.min)
6004                     sayNO;
6005                 SET_locinput(li);
6006                 if ((ST.count > ST.min)
6007                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6008                 {
6009                     /* A{m,n} must come at the end of the string, there's
6010                      * no point in backing off ... */
6011                     ST.min = ST.count;
6012                     /* ...except that $ and \Z can match before *and* after
6013                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6014                        We may back off by one in this case. */
6015                     if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6016                         ST.min--;
6017                 }
6018                 REGCP_SET(ST.cp);
6019                 goto curly_try_B_max;
6020             }
6021             assert(0); /* NOTREACHED */
6022
6023
6024         case CURLY_B_min_known_fail:
6025             /* failed to find B in a non-greedy match where c1,c2 valid */
6026
6027             REGCP_UNWIND(ST.cp);
6028             if (ST.paren) {
6029                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6030             }
6031             /* Couldn't or didn't -- move forward. */
6032             ST.oldloc = locinput;
6033             if (utf8_target)
6034                 locinput += UTF8SKIP(locinput);
6035             else
6036                 locinput++;
6037             ST.count++;
6038           curly_try_B_min_known:
6039              /* find the next place where 'B' could work, then call B */
6040             {
6041                 int n;
6042                 if (utf8_target) {
6043                     n = (ST.oldloc == locinput) ? 0 : 1;
6044                     if (ST.c1 == ST.c2) {
6045                         /* set n to utf8_distance(oldloc, locinput) */
6046                         while (locinput <= ST.maxpos
6047                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6048                         {
6049                             locinput += UTF8SKIP(locinput);
6050                             n++;
6051                         }
6052                     }
6053                     else {
6054                         /* set n to utf8_distance(oldloc, locinput) */
6055                         while (locinput <= ST.maxpos
6056                               && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6057                               && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6058                         {
6059                             locinput += UTF8SKIP(locinput);
6060                             n++;
6061                         }
6062                     }
6063                 }
6064                 else {  /* Not utf8_target */
6065                     if (ST.c1 == ST.c2) {
6066                         while (locinput <= ST.maxpos &&
6067                                UCHARAT(locinput) != ST.c1)
6068                             locinput++;
6069                     }
6070                     else {
6071                         while (locinput <= ST.maxpos
6072                                && UCHARAT(locinput) != ST.c1
6073                                && UCHARAT(locinput) != ST.c2)
6074                             locinput++;
6075                     }
6076                     n = locinput - ST.oldloc;
6077                 }
6078                 if (locinput > ST.maxpos)
6079                     sayNO;
6080                 if (n) {
6081                     /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6082                      * at b; check that everything between oldloc and
6083                      * locinput matches */
6084                     char *li = ST.oldloc;
6085                     ST.count += n;
6086                     if (regrepeat(rex, &li, ST.A, n, depth) < n)
6087                         sayNO;
6088                     assert(n == REG_INFTY || locinput == li);
6089                 }
6090                 CURLY_SETPAREN(ST.paren, ST.count);
6091                 if (cur_eval && cur_eval->u.eval.close_paren &&
6092                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6093                     goto fake_end;
6094                 }
6095                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6096             }
6097             assert(0); /* NOTREACHED */
6098
6099
6100         case CURLY_B_min_fail:
6101             /* failed to find B in a non-greedy match where c1,c2 invalid */
6102
6103             REGCP_UNWIND(ST.cp);
6104             if (ST.paren) {
6105                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6106             }
6107             /* failed -- move forward one */
6108             {
6109                 char *li = locinput;
6110                 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
6111                     sayNO;
6112                 }
6113                 locinput = li;
6114             }
6115             {
6116                 ST.count++;
6117                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6118                         ST.count > 0)) /* count overflow ? */
6119                 {
6120                   curly_try_B_min:
6121                     CURLY_SETPAREN(ST.paren, ST.count);
6122                     if (cur_eval && cur_eval->u.eval.close_paren &&
6123                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
6124                         goto fake_end;
6125                     }
6126                     PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6127                 }
6128             }
6129             sayNO;
6130             assert(0); /* NOTREACHED */
6131
6132
6133         curly_try_B_max:
6134             /* a successful greedy match: now try to match B */
6135             if (cur_eval && cur_eval->u.eval.close_paren &&
6136                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6137                 goto fake_end;
6138             }
6139             {
6140                 bool could_match = locinput < PL_regeol;
6141
6142                 /* If it could work, try it. */
6143                 if (ST.c1 != CHRTEST_VOID && could_match) {
6144                     if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6145                     {
6146                         could_match = memEQ(locinput,
6147                                             ST.c1_utf8,
6148                                             UTF8SKIP(locinput))
6149                                     || memEQ(locinput,
6150                                              ST.c2_utf8,
6151                                              UTF8SKIP(locinput));
6152                     }
6153                     else {
6154                         could_match = UCHARAT(locinput) == ST.c1
6155                                       || UCHARAT(locinput) == ST.c2;
6156                     }
6157                 }
6158                 if (ST.c1 == CHRTEST_VOID || could_match) {
6159                     CURLY_SETPAREN(ST.paren, ST.count);
6160                     PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6161                     assert(0); /* NOTREACHED */
6162                 }
6163             }
6164             /* FALL THROUGH */
6165
6166         case CURLY_B_max_fail:
6167             /* failed to find B in a greedy match */
6168
6169             REGCP_UNWIND(ST.cp);
6170             if (ST.paren) {
6171                 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6172             }
6173             /*  back up. */
6174             if (--ST.count < ST.min)
6175                 sayNO;
6176             locinput = HOPc(locinput, -1);
6177             goto curly_try_B_max;
6178
6179 #undef ST
6180
6181         case END: /*  last op of main pattern  */
6182             fake_end:
6183             if (cur_eval) {
6184                 /* we've just finished A in /(??{A})B/; now continue with B */
6185                 st->u.eval.toggle_reg_flags
6186                             = cur_eval->u.eval.toggle_reg_flags;
6187                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6188
6189                 st->u.eval.prev_rex = rex_sv;           /* inner */
6190
6191                 /* Save *all* the positions. */
6192                 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6193                 rex_sv = cur_eval->u.eval.prev_rex;
6194                 SET_reg_curpm(rex_sv);
6195                 rex = ReANY(rex_sv);
6196                 rexi = RXi_GET(rex);
6197                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6198
6199                 REGCP_SET(st->u.eval.lastcp);
6200
6201                 /* Restore parens of the outer rex without popping the
6202                  * savestack */
6203                 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6204                                         &maxopenparen);
6205
6206                 st->u.eval.prev_eval = cur_eval;
6207                 cur_eval = cur_eval->u.eval.prev_eval;
6208                 DEBUG_EXECUTE_r(
6209                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6210                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6211                 if ( nochange_depth )
6212                     nochange_depth--;
6213
6214                 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6215                                     locinput); /* match B */
6216             }
6217
6218             if (locinput < reginfo->till) {
6219                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6220                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6221                                       PL_colors[4],
6222                                       (long)(locinput - PL_reg_starttry),
6223                                       (long)(reginfo->till - PL_reg_starttry),
6224                                       PL_colors[5]));
6225
6226                 sayNO_SILENT;           /* Cannot match: too short. */
6227             }
6228             sayYES;                     /* Success! */
6229
6230         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6231             DEBUG_EXECUTE_r(
6232             PerlIO_printf(Perl_debug_log,
6233                 "%*s  %ssubpattern success...%s\n",
6234                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6235             sayYES;                     /* Success! */
6236
6237 #undef  ST
6238 #define ST st->u.ifmatch
6239
6240         {
6241             char *newstart;
6242
6243         case SUSPEND:   /* (?>A) */
6244             ST.wanted = 1;
6245             newstart = locinput;
6246             goto do_ifmatch;
6247
6248         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
6249             ST.wanted = 0;
6250             goto ifmatch_trivial_fail_test;
6251
6252         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
6253             ST.wanted = 1;
6254           ifmatch_trivial_fail_test:
6255             if (scan->flags) {
6256                 char * const s = HOPBACKc(locinput, scan->flags);
6257                 if (!s) {
6258                     /* trivial fail */
6259                     if (logical) {
6260                         logical = 0;
6261                         sw = 1 - cBOOL(ST.wanted);
6262                     }
6263                     else if (ST.wanted)
6264                         sayNO;
6265                     next = scan + ARG(scan);
6266                     if (next == scan)
6267                         next = NULL;
6268                     break;
6269                 }
6270                 newstart = s;
6271             }
6272             else
6273                 newstart = locinput;
6274
6275           do_ifmatch:
6276             ST.me = scan;
6277             ST.logical = logical;
6278             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6279
6280             /* execute body of (?...A) */
6281             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6282             assert(0); /* NOTREACHED */
6283         }
6284
6285         case IFMATCH_A_fail: /* body of (?...A) failed */
6286             ST.wanted = !ST.wanted;
6287             /* FALL THROUGH */
6288
6289         case IFMATCH_A: /* body of (?...A) succeeded */
6290             if (ST.logical) {
6291                 sw = cBOOL(ST.wanted);
6292             }
6293             else if (!ST.wanted)
6294                 sayNO;
6295
6296             if (OP(ST.me) != SUSPEND) {
6297                 /* restore old position except for (?>...) */
6298                 locinput = st->locinput;
6299             }
6300             scan = ST.me + ARG(ST.me);
6301             if (scan == ST.me)
6302                 scan = NULL;
6303             continue; /* execute B */
6304
6305 #undef ST
6306
6307         case LONGJMP: /*  alternative with many branches compiles to
6308                        * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6309             next = scan + ARG(scan);
6310             if (next == scan)
6311                 next = NULL;
6312             break;
6313
6314         case COMMIT:  /*  (*COMMIT)  */
6315             reginfo->cutpoint = PL_regeol;
6316             /* FALLTHROUGH */
6317
6318         case PRUNE:   /*  (*PRUNE)   */
6319             if (!scan->flags)
6320                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6321             PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6322             assert(0); /* NOTREACHED */
6323
6324         case COMMIT_next_fail:
6325             no_final = 1;
6326             /* FALLTHROUGH */
6327
6328         case OPFAIL:   /* (*FAIL)  */
6329             sayNO;
6330             assert(0); /* NOTREACHED */
6331
6332 #define ST st->u.mark
6333         case MARKPOINT: /*  (*MARK:foo)  */
6334             ST.prev_mark = mark_state;
6335             ST.mark_name = sv_commit = sv_yes_mark
6336                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6337             mark_state = st;
6338             ST.mark_loc = locinput;
6339             PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6340             assert(0); /* NOTREACHED */
6341
6342         case MARKPOINT_next:
6343             mark_state = ST.prev_mark;
6344             sayYES;
6345             assert(0); /* NOTREACHED */
6346
6347         case MARKPOINT_next_fail:
6348             if (popmark && sv_eq(ST.mark_name,popmark))
6349             {
6350                 if (ST.mark_loc > startpoint)
6351                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6352                 popmark = NULL; /* we found our mark */
6353                 sv_commit = ST.mark_name;
6354
6355                 DEBUG_EXECUTE_r({
6356                         PerlIO_printf(Perl_debug_log,
6357                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6358                             REPORT_CODE_OFF+depth*2, "",
6359                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6360                 });
6361             }
6362             mark_state = ST.prev_mark;
6363             sv_yes_mark = mark_state ?
6364                 mark_state->u.mark.mark_name : NULL;
6365             sayNO;
6366             assert(0); /* NOTREACHED */
6367
6368         case SKIP:  /*  (*SKIP)  */
6369             if (scan->flags) {
6370                 /* (*SKIP) : if we fail we cut here*/
6371                 ST.mark_name = NULL;
6372                 ST.mark_loc = locinput;
6373                 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6374             } else {
6375                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6376                    otherwise do nothing.  Meaning we need to scan
6377                  */
6378                 regmatch_state *cur = mark_state;
6379                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6380
6381                 while (cur) {
6382                     if ( sv_eq( cur->u.mark.mark_name,
6383                                 find ) )
6384                     {
6385                         ST.mark_name = find;
6386                         PUSH_STATE_GOTO( SKIP_next, next, locinput);
6387                     }
6388                     cur = cur->u.mark.prev_mark;
6389                 }
6390             }
6391             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6392             break;
6393
6394         case SKIP_next_fail:
6395             if (ST.mark_name) {
6396                 /* (*CUT:NAME) - Set up to search for the name as we
6397                    collapse the stack*/
6398                 popmark = ST.mark_name;
6399             } else {
6400                 /* (*CUT) - No name, we cut here.*/
6401                 if (ST.mark_loc > startpoint)
6402                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6403                 /* but we set sv_commit to latest mark_name if there
6404                    is one so they can test to see how things lead to this
6405                    cut */
6406                 if (mark_state)
6407                     sv_commit=mark_state->u.mark.mark_name;
6408             }
6409             no_final = 1;
6410             sayNO;
6411             assert(0); /* NOTREACHED */
6412 #undef ST
6413
6414         case LNBREAK: /* \R */
6415             if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6416                 locinput += n;
6417             } else
6418                 sayNO;
6419             break;
6420
6421 #define CASE_CLASS(nAmE)                              \
6422         case nAmE:                                    \
6423             if (NEXTCHR_IS_EOS)                       \
6424                 sayNO;                                \
6425             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6426                 locinput += n;                        \
6427             } else                                    \
6428                 sayNO;                                \
6429             break;                                    \
6430         case N##nAmE:                                 \
6431             if (NEXTCHR_IS_EOS)                       \
6432                 sayNO;                                \
6433             if ((n=is_##nAmE(locinput,utf8_target))) {    \
6434                 sayNO;                                \
6435             } else {                                  \
6436                 locinput += UTF8SKIP(locinput);       \
6437             }                                         \
6438             break
6439
6440         CASE_CLASS(VERTWS);  /*  \v \V  */
6441         CASE_CLASS(HORIZWS); /*  \h \H  */
6442 #undef CASE_CLASS
6443
6444         default:
6445             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6446                           PTR2UV(scan), OP(scan));
6447             Perl_croak(aTHX_ "regexp memory corruption");
6448
6449         /* this is a point to jump to in order to increment
6450          * locinput by one character */
6451         increment_locinput:
6452             assert(!NEXTCHR_IS_EOS);
6453             if (utf8_target) {
6454                 locinput += PL_utf8skip[nextchr];
6455                 /* locinput is allowed to go 1 char off the end, but not 2+ */
6456                 if (locinput > PL_regeol)
6457                     sayNO;
6458             }
6459             else
6460                 locinput++;
6461             break;
6462
6463         } /* end switch */
6464
6465         /* switch break jumps here */
6466         scan = next; /* prepare to execute the next op and ... */
6467         continue;    /* ... jump back to the top, reusing st */
6468         assert(0); /* NOTREACHED */
6469
6470       push_yes_state:
6471         /* push a state that backtracks on success */
6472         st->u.yes.prev_yes_state = yes_state;
6473         yes_state = st;
6474         /* FALL THROUGH */
6475       push_state:
6476         /* push a new regex state, then continue at scan  */
6477         {
6478             regmatch_state *newst;
6479
6480             DEBUG_STACK_r({
6481                 regmatch_state *cur = st;
6482                 regmatch_state *curyes = yes_state;
6483                 int curd = depth;
6484                 regmatch_slab *slab = PL_regmatch_slab;
6485                 for (;curd > -1;cur--,curd--) {
6486                     if (cur < SLAB_FIRST(slab)) {
6487                         slab = slab->prev;
6488                         cur = SLAB_LAST(slab);
6489                     }
6490                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6491                         REPORT_CODE_OFF + 2 + depth * 2,"",
6492                         curd, PL_reg_name[cur->resume_state],
6493                         (curyes == cur) ? "yes" : ""
6494                     );
6495                     if (curyes == cur)
6496                         curyes = cur->u.yes.prev_yes_state;
6497                 }
6498             } else
6499                 DEBUG_STATE_pp("push")
6500             );
6501             depth++;
6502             st->locinput = locinput;
6503             newst = st+1;
6504             if (newst >  SLAB_LAST(PL_regmatch_slab))
6505                 newst = S_push_slab(aTHX);
6506             PL_regmatch_state = newst;
6507
6508             locinput = pushinput;
6509             st = newst;
6510             continue;
6511             assert(0); /* NOTREACHED */
6512         }
6513     }
6514
6515     /*
6516     * We get here only if there's trouble -- normally "case END" is
6517     * the terminating point.
6518     */
6519     Perl_croak(aTHX_ "corrupted regexp pointers");
6520     /*NOTREACHED*/
6521     sayNO;
6522
6523 yes:
6524     if (yes_state) {
6525         /* we have successfully completed a subexpression, but we must now
6526          * pop to the state marked by yes_state and continue from there */
6527         assert(st != yes_state);
6528 #ifdef DEBUGGING
6529         while (st != yes_state) {
6530             st--;
6531             if (st < SLAB_FIRST(PL_regmatch_slab)) {
6532                 PL_regmatch_slab = PL_regmatch_slab->prev;
6533                 st = SLAB_LAST(PL_regmatch_slab);
6534             }
6535             DEBUG_STATE_r({
6536                 if (no_final) {
6537                     DEBUG_STATE_pp("pop (no final)");
6538                 } else {
6539                     DEBUG_STATE_pp("pop (yes)");
6540                 }
6541             });
6542             depth--;
6543         }
6544 #else
6545         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6546             || yes_state > SLAB_LAST(PL_regmatch_slab))
6547         {
6548             /* not in this slab, pop slab */
6549             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6550             PL_regmatch_slab = PL_regmatch_slab->prev;
6551             st = SLAB_LAST(PL_regmatch_slab);
6552         }
6553         depth -= (st - yes_state);
6554 #endif
6555         st = yes_state;
6556         yes_state = st->u.yes.prev_yes_state;
6557         PL_regmatch_state = st;
6558
6559         if (no_final)
6560             locinput= st->locinput;
6561         state_num = st->resume_state + no_final;
6562         goto reenter_switch;
6563     }
6564
6565     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6566                           PL_colors[4], PL_colors[5]));
6567
6568     if (PL_reg_state.re_state_eval_setup_done) {
6569         /* each successfully executed (?{...}) block does the equivalent of
6570          *   local $^R = do {...}
6571          * When popping the save stack, all these locals would be undone;
6572          * bypass this by setting the outermost saved $^R to the latest
6573          * value */
6574         if (oreplsv != GvSV(PL_replgv))
6575             sv_setsv(oreplsv, GvSV(PL_replgv));
6576     }
6577     result = 1;
6578     goto final_exit;
6579
6580 no:
6581     DEBUG_EXECUTE_r(
6582         PerlIO_printf(Perl_debug_log,
6583             "%*s  %sfailed...%s\n",
6584             REPORT_CODE_OFF+depth*2, "",
6585             PL_colors[4], PL_colors[5])
6586         );
6587
6588 no_silent:
6589     if (no_final) {
6590         if (yes_state) {
6591             goto yes;
6592         } else {
6593             goto final_exit;
6594         }
6595     }
6596     if (depth) {
6597         /* there's a previous state to backtrack to */
6598         st--;
6599         if (st < SLAB_FIRST(PL_regmatch_slab)) {
6600             PL_regmatch_slab = PL_regmatch_slab->prev;
6601             st = SLAB_LAST(PL_regmatch_slab);
6602         }
6603         PL_regmatch_state = st;
6604         locinput= st->locinput;
6605
6606         DEBUG_STATE_pp("pop");
6607         depth--;
6608         if (yes_state == st)
6609             yes_state = st->u.yes.prev_yes_state;
6610
6611         state_num = st->resume_state + 1; /* failure = success + 1 */
6612         goto reenter_switch;
6613     }
6614     result = 0;
6615
6616   final_exit:
6617     if (rex->intflags & PREGf_VERBARG_SEEN) {
6618         SV *sv_err = get_sv("REGERROR", 1);
6619         SV *sv_mrk = get_sv("REGMARK", 1);
6620         if (result) {
6621             sv_commit = &PL_sv_no;
6622             if (!sv_yes_mark)
6623                 sv_yes_mark = &PL_sv_yes;
6624         } else {
6625             if (!sv_commit)
6626                 sv_commit = &PL_sv_yes;
6627             sv_yes_mark = &PL_sv_no;
6628         }
6629         sv_setsv(sv_err, sv_commit);
6630         sv_setsv(sv_mrk, sv_yes_mark);
6631     }
6632
6633
6634     if (last_pushed_cv) {
6635         dSP;
6636         POP_MULTICALL;
6637         PERL_UNUSED_VAR(SP);
6638     }
6639
6640     /* clean up; in particular, free all slabs above current one */
6641     LEAVE_SCOPE(oldsave);
6642
6643     assert(!result ||  locinput - PL_bostr >= 0);
6644     return result ?  locinput - PL_bostr : -1;
6645 }
6646
6647 /*
6648  - regrepeat - repeatedly match something simple, report how many
6649  *
6650  * What 'simple' means is a node which can be the operand of a quantifier like
6651  * '+', or {1,3}
6652  *
6653  * startposp - pointer a pointer to the start position.  This is updated
6654  *             to point to the byte following the highest successful
6655  *             match.
6656  * p         - the regnode to be repeatedly matched against.
6657  * max       - maximum number of things to match.
6658  * depth     - (for debugging) backtracking depth.
6659  */
6660 STATIC I32
6661 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6662 {
6663     dVAR;
6664     char *scan;     /* Pointer to current position in target string */
6665     I32 c;
6666     char *loceol = PL_regeol;   /* local version */
6667     I32 hardcount = 0;  /* How many matches so far */
6668     bool utf8_target = PL_reg_match_utf8;
6669     UV utf8_flags;
6670 #ifndef DEBUGGING
6671     PERL_UNUSED_ARG(depth);
6672 #endif
6673
6674     PERL_ARGS_ASSERT_REGREPEAT;
6675
6676     scan = *startposp;
6677     if (max == REG_INFTY)
6678         max = I32_MAX;
6679     else if (! utf8_target && scan + max < loceol)
6680         loceol = scan + max;
6681
6682     /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6683      * to the maximum of how far we should go in it (leaving it set to the real
6684      * end, if the maximum permissible would take us beyond that).  This allows
6685      * us to make the loop exit condition that we haven't gone past <loceol> to
6686      * also mean that we haven't exceeded the max permissible count, saving a
6687      * test each time through the loop.  But it assumes that the OP matches a
6688      * single byte, which is true for most of the OPs below when applied to a
6689      * non-UTF-8 target.  Those relatively few OPs that don't have this
6690      * characteristic will have to compensate.
6691      *
6692      * There is no adjustment for UTF-8 targets, as the number of bytes per
6693      * character varies.  OPs will have to test both that the count is less
6694      * than the max permissible (using <hardcount> to keep track), and that we
6695      * are still within the bounds of the string (using <loceol>.  A few OPs
6696      * match a single byte no matter what the encoding.  They can omit the max
6697      * test if, for the UTF-8 case, they do the adjustment that was skipped
6698      * above.
6699      *
6700      * Thus, the code above sets things up for the common case; and exceptional
6701      * cases need extra work; the common case is to make sure <scan> doesn't
6702      * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6703      * count doesn't exceed the maximum permissible */
6704
6705     switch (OP(p)) {
6706     case REG_ANY:
6707         if (utf8_target) {
6708             while (scan < loceol && hardcount < max && *scan != '\n') {
6709                 scan += UTF8SKIP(scan);
6710                 hardcount++;
6711             }
6712         } else {
6713             while (scan < loceol && *scan != '\n')
6714                 scan++;
6715         }
6716         break;
6717     case SANY:
6718         if (utf8_target) {
6719             while (scan < loceol && hardcount < max) {
6720                 scan += UTF8SKIP(scan);
6721                 hardcount++;
6722             }
6723         }
6724         else
6725             scan = loceol;
6726         break;
6727     case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6728         if (utf8_target && scan + max < loceol) {
6729
6730             /* <loceol> hadn't been adjusted in the UTF-8 case */
6731             scan +=  max;
6732         }
6733         else {
6734             scan = loceol;
6735         }
6736         break;
6737     case EXACT:
6738         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6739
6740         c = (U8)*STRING(p);
6741
6742         /* Can use a simple loop if the pattern char to match on is invariant
6743          * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6744          * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6745          * true iff it doesn't matter if the argument is in UTF-8 or not */
6746         if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6747             if (utf8_target && scan + max < loceol) {
6748                 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6749                  * since here, to match at all, 1 char == 1 byte */
6750                 loceol = scan + max;
6751             }
6752             while (scan < loceol && UCHARAT(scan) == c) {
6753                 scan++;
6754             }
6755         }
6756         else if (UTF_PATTERN) {
6757             if (utf8_target) {
6758                 STRLEN scan_char_len;
6759
6760                 /* When both target and pattern are UTF-8, we have to do
6761                  * string EQ */
6762                 while (hardcount < max
6763                        && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6764                        && scan_char_len <= STR_LEN(p)
6765                        && memEQ(scan, STRING(p), scan_char_len))
6766                 {
6767                     scan += scan_char_len;
6768                     hardcount++;
6769                 }
6770             }
6771             else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6772
6773                 /* Target isn't utf8; convert the character in the UTF-8
6774                  * pattern to non-UTF8, and do a simple loop */
6775                 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6776                 while (scan < loceol && UCHARAT(scan) == c) {
6777                     scan++;
6778                 }
6779             } /* else pattern char is above Latin1, can't possibly match the
6780                  non-UTF-8 target */
6781         }
6782         else {
6783
6784             /* Here, the string must be utf8; pattern isn't, and <c> is
6785              * different in utf8 than not, so can't compare them directly.
6786              * Outside the loop, find the two utf8 bytes that represent c, and
6787              * then look for those in sequence in the utf8 string */
6788             U8 high = UTF8_TWO_BYTE_HI(c);
6789             U8 low = UTF8_TWO_BYTE_LO(c);
6790
6791             while (hardcount < max
6792                     && scan + 1 < loceol
6793                     && UCHARAT(scan) == high
6794                     && UCHARAT(scan + 1) == low)
6795             {
6796                 scan += 2;
6797                 hardcount++;
6798             }
6799         }
6800         break;
6801
6802     case EXACTFA:
6803         utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6804         goto do_exactf;
6805
6806     case EXACTFL:
6807         PL_reg_flags |= RF_tainted;
6808         utf8_flags = FOLDEQ_UTF8_LOCALE;
6809         goto do_exactf;
6810
6811     case EXACTF:
6812             utf8_flags = 0;
6813             goto do_exactf;
6814
6815     case EXACTFU_SS:
6816     case EXACTFU_TRICKYFOLD:
6817     case EXACTFU:
6818         utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6819
6820     do_exactf: {
6821         int c1, c2;
6822         U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6823
6824         assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6825
6826         if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6827             if (c1 == CHRTEST_VOID) {
6828                 /* Use full Unicode fold matching */
6829                 char *tmpeol = PL_regeol;
6830                 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6831                 while (hardcount < max
6832                         && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6833                                              STRING(p), NULL, pat_len,
6834                                              cBOOL(UTF_PATTERN), utf8_flags))
6835                 {
6836                     scan = tmpeol;
6837                     tmpeol = PL_regeol;
6838                     hardcount++;
6839                 }
6840             }
6841             else if (utf8_target) {
6842                 if (c1 == c2) {
6843                     while (scan < loceol
6844                            && hardcount < max
6845                            && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6846                     {
6847                         scan += UTF8SKIP(scan);
6848                         hardcount++;
6849                     }
6850                 }
6851                 else {
6852                     while (scan < loceol
6853                            && hardcount < max
6854                            && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6855                                || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6856                     {
6857                         scan += UTF8SKIP(scan);
6858                         hardcount++;
6859                     }
6860                 }
6861             }
6862             else if (c1 == c2) {
6863                 while (scan < loceol && UCHARAT(scan) == c1) {
6864                     scan++;
6865                 }
6866             }
6867             else {
6868                 while (scan < loceol &&
6869                     (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6870                 {
6871                     scan++;
6872                 }
6873             }
6874         }
6875         break;
6876     }
6877     case ANYOF:
6878         if (utf8_target) {
6879             STRLEN inclasslen;
6880             while (hardcount < max
6881                    && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
6882                    && reginclass(prog, p, (U8*)scan, utf8_target))
6883             {
6884                 scan += inclasslen;
6885                 hardcount++;
6886             }
6887         } else {
6888             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6889                 scan++;
6890         }
6891         break;
6892     case ALNUMU:
6893         if (utf8_target) {
6894     utf8_wordchar:
6895             LOAD_UTF8_CHARCLASS_ALNUM();
6896             while (hardcount < max && scan < loceol &&
6897                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6898             {
6899                 scan += UTF8SKIP(scan);
6900                 hardcount++;
6901             }
6902         } else {
6903             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6904                 scan++;
6905             }
6906         }
6907         break;
6908     case ALNUM:
6909         if (utf8_target)
6910             goto utf8_wordchar;
6911         while (scan < loceol && isALNUM((U8) *scan)) {
6912             scan++;
6913         }
6914         break;
6915     case ALNUMA:
6916         if (utf8_target && scan + max < loceol) {
6917
6918             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6919              * since here, to match, 1 char == 1 byte */
6920             loceol = scan + max;
6921         }
6922         while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6923             scan++;
6924         }
6925         break;
6926     case ALNUML:
6927         PL_reg_flags |= RF_tainted;
6928         if (utf8_target) {
6929             while (hardcount < max && scan < loceol &&
6930                    isALNUM_LC_utf8((U8*)scan)) {
6931                 scan += UTF8SKIP(scan);
6932                 hardcount++;
6933             }
6934         } else {
6935             while (scan < loceol && isALNUM_LC(*scan))
6936                 scan++;
6937         }
6938         break;
6939     case NALNUMU:
6940         if (utf8_target) {
6941
6942     utf8_Nwordchar:
6943
6944             LOAD_UTF8_CHARCLASS_ALNUM();
6945             while (hardcount < max && scan < loceol &&
6946                    ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6947             {
6948                 scan += UTF8SKIP(scan);
6949                 hardcount++;
6950             }
6951         } else {
6952             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6953                 scan++;
6954             }
6955         }
6956         break;
6957     case NALNUM:
6958         if (utf8_target)
6959             goto utf8_Nwordchar;
6960         while (scan < loceol && ! isALNUM((U8) *scan)) {
6961             scan++;
6962         }
6963         break;
6964
6965     case POSIXA:
6966         if (utf8_target && scan + max < loceol) {
6967
6968             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6969              * since here, to match, 1 char == 1 byte */
6970             loceol = scan + max;
6971         }
6972         while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6973             scan++;
6974         }
6975         break;
6976     case NPOSIXA:
6977         if (utf8_target) {
6978             while (scan < loceol && hardcount < max
6979                    && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6980             {
6981                 scan += UTF8SKIP(scan);
6982                 hardcount++;
6983             }
6984         }
6985         else {
6986             while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6987                 scan++;
6988             }
6989         }
6990         break;
6991     case NALNUMA:
6992         if (utf8_target) {
6993             while (scan < loceol && hardcount < max
6994                    && ! isWORDCHAR_A((U8) *scan))
6995             {
6996                 scan += UTF8SKIP(scan);
6997                 hardcount++;
6998             }
6999         }
7000         else {
7001             while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
7002                 scan++;
7003             }
7004         }
7005         break;
7006     case NALNUML:
7007         PL_reg_flags |= RF_tainted;
7008         if (utf8_target) {
7009             while (hardcount < max && scan < loceol &&
7010                    !isALNUM_LC_utf8((U8*)scan)) {
7011                 scan += UTF8SKIP(scan);
7012                 hardcount++;
7013             }
7014         } else {
7015             while (scan < loceol && !isALNUM_LC(*scan))
7016                 scan++;
7017         }
7018         break;
7019     case SPACEU:
7020         if (utf8_target) {
7021
7022     utf8_space:
7023
7024             while (hardcount < max && scan < loceol
7025                    && is_XPERLSPACE_utf8((U8*)scan))
7026             {
7027                 scan += UTF8SKIP(scan);
7028                 hardcount++;
7029             }
7030             break;
7031         }
7032         else {
7033             while (scan < loceol && isSPACE_L1((U8) *scan)) {
7034                 scan++;
7035             }
7036             break;
7037         }
7038     case SPACE:
7039         if (utf8_target)
7040             goto utf8_space;
7041
7042         while (scan < loceol && isSPACE((U8) *scan)) {
7043             scan++;
7044         }
7045         break;
7046     case SPACEA:
7047         if (utf8_target && scan + max < loceol) {
7048
7049             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7050              * since here, to match, 1 char == 1 byte */
7051             loceol = scan + max;
7052         }
7053         while (scan < loceol && isSPACE_A((U8) *scan)) {
7054             scan++;
7055         }
7056         break;
7057     case SPACEL:
7058         PL_reg_flags |= RF_tainted;
7059         if (utf8_target) {
7060             while (hardcount < max && scan < loceol &&
7061                    isSPACE_LC_utf8((U8*)scan)) {
7062                 scan += UTF8SKIP(scan);
7063                 hardcount++;
7064             }
7065         } else {
7066             while (scan < loceol && isSPACE_LC(*scan))
7067                 scan++;
7068         }
7069         break;
7070     case NSPACEU:
7071         if (utf8_target) {
7072
7073     utf8_Nspace:
7074
7075             while (hardcount < max && scan < loceol
7076                    && ! is_XPERLSPACE_utf8((U8*)scan))
7077             {
7078                 scan += UTF8SKIP(scan);
7079                 hardcount++;
7080             }
7081             break;
7082         }
7083         else {
7084             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
7085                 scan++;
7086             }
7087         }
7088         break;
7089     case NSPACE:
7090         if (utf8_target)
7091             goto utf8_Nspace;
7092
7093         while (scan < loceol && ! isSPACE((U8) *scan)) {
7094             scan++;
7095         }
7096         break;
7097     case NSPACEA:
7098         if (utf8_target) {
7099             while (hardcount < max && scan < loceol
7100                    && ! isSPACE_A((U8) *scan))
7101             {
7102                 scan += UTF8SKIP(scan);
7103                 hardcount++;
7104             }
7105         }
7106         else {
7107             while (scan < loceol && ! isSPACE_A((U8) *scan)) {
7108                 scan++;
7109             }
7110         }
7111         break;
7112     case NSPACEL:
7113         PL_reg_flags |= RF_tainted;
7114         if (utf8_target) {
7115             while (hardcount < max && scan < loceol &&
7116                    !isSPACE_LC_utf8((U8*)scan)) {
7117                 scan += UTF8SKIP(scan);
7118                 hardcount++;
7119             }
7120         } else {
7121             while (scan < loceol && !isSPACE_LC(*scan))
7122                 scan++;
7123         }
7124         break;
7125     case DIGIT:
7126         if (utf8_target) {
7127             LOAD_UTF8_CHARCLASS_DIGIT();
7128             while (hardcount < max && scan < loceol &&
7129                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7130                 scan += UTF8SKIP(scan);
7131                 hardcount++;
7132             }
7133         } else {
7134             while (scan < loceol && isDIGIT(*scan))
7135                 scan++;
7136         }
7137         break;
7138     case DIGITA:
7139         if (utf8_target && scan + max < loceol) {
7140
7141             /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7142              * since here, to match, 1 char == 1 byte */
7143             loceol = scan + max;
7144         }
7145         while (scan < loceol && isDIGIT_A((U8) *scan)) {
7146             scan++;
7147         }
7148         break;
7149     case DIGITL:
7150         PL_reg_flags |= RF_tainted;
7151         if (utf8_target) {
7152             while (hardcount < max && scan < loceol &&
7153                    isDIGIT_LC_utf8((U8*)scan)) {
7154                 scan += UTF8SKIP(scan);
7155                 hardcount++;
7156             }
7157         } else {
7158             while (scan < loceol && isDIGIT_LC(*scan))
7159                 scan++;
7160         }
7161         break;
7162     case NDIGIT:
7163         if (utf8_target) {
7164             LOAD_UTF8_CHARCLASS_DIGIT();
7165             while (hardcount < max && scan < loceol &&
7166                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7167                 scan += UTF8SKIP(scan);
7168                 hardcount++;
7169             }
7170         } else {
7171             while (scan < loceol && !isDIGIT(*scan))
7172                 scan++;
7173         }
7174         break;
7175     case NDIGITA:
7176         if (utf8_target) {
7177             while (hardcount < max && scan < loceol
7178                    && ! isDIGIT_A((U8) *scan)) {
7179                 scan += UTF8SKIP(scan);
7180                 hardcount++;
7181             }
7182         }
7183         else {
7184             while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7185                 scan++;
7186             }
7187         }
7188         break;
7189     case NDIGITL:
7190         PL_reg_flags |= RF_tainted;
7191         if (utf8_target) {
7192             while (hardcount < max && scan < loceol &&
7193                    !isDIGIT_LC_utf8((U8*)scan)) {
7194                 scan += UTF8SKIP(scan);
7195                 hardcount++;
7196             }
7197         } else {
7198             while (scan < loceol && !isDIGIT_LC(*scan))
7199                 scan++;
7200         }
7201         break;
7202     case LNBREAK:
7203         if (utf8_target) {
7204             while (hardcount < max && scan < loceol &&
7205                     (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7206                 scan += c;
7207                 hardcount++;
7208             }
7209         } else {
7210             /* LNBREAK can match one or two latin chars, which is ok, but we
7211              * have to use hardcount in this situation, and throw away the
7212              * adjustment to <loceol> done before the switch statement */
7213             loceol = PL_regeol;
7214             while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7215                 scan+=c;
7216                 hardcount++;
7217             }
7218         }
7219         break;
7220     case HORIZWS:
7221         if (utf8_target) {
7222             while (hardcount < max && scan < loceol &&
7223                     (c=is_HORIZWS_utf8_safe(scan, loceol)))
7224             {
7225                 scan += c;
7226                 hardcount++;
7227             }
7228         } else {
7229             while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7230                 scan++;
7231         }
7232         break;
7233     case NHORIZWS:
7234         if (utf8_target) {
7235             while (hardcount < max && scan < loceol &&
7236                         !is_HORIZWS_utf8_safe(scan, loceol))
7237             {
7238                 scan += UTF8SKIP(scan);
7239                 hardcount++;
7240             }
7241         } else {
7242             while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7243                 scan++;
7244
7245         }
7246         break;
7247     case VERTWS:
7248         if (utf8_target) {
7249             while (hardcount < max && scan < loceol &&
7250                             (c=is_VERTWS_utf8_safe(scan, loceol)))
7251             {
7252                 scan += c;
7253                 hardcount++;
7254             }
7255         } else {
7256             while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7257                 scan++;
7258
7259         }
7260         break;
7261     case NVERTWS:
7262         if (utf8_target) {
7263             while (hardcount < max && scan < loceol &&
7264                                 !is_VERTWS_utf8_safe(scan, loceol))
7265             {
7266                 scan += UTF8SKIP(scan);
7267                 hardcount++;
7268             }
7269         } else {
7270             while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7271                 scan++;
7272
7273         }
7274         break;
7275
7276     case BOUND:
7277     case BOUNDA:
7278     case BOUNDL:
7279     case BOUNDU:
7280     case EOS:
7281     case GPOS:
7282     case KEEPS:
7283     case NBOUND:
7284     case NBOUNDA:
7285     case NBOUNDL:
7286     case NBOUNDU:
7287     case OPFAIL:
7288     case SBOL:
7289     case SEOL:
7290         /* These are all 0 width, so match right here or not at all. */
7291         break;
7292
7293     default:
7294         Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7295         assert(0); /* NOTREACHED */
7296
7297     }
7298
7299     if (hardcount)
7300         c = hardcount;
7301     else
7302         c = scan - *startposp;
7303     *startposp = scan;
7304
7305     DEBUG_r({
7306         GET_RE_DEBUG_FLAGS_DECL;
7307         DEBUG_EXECUTE_r({
7308             SV * const prop = sv_newmortal();
7309             regprop(prog, prop, p);
7310             PerlIO_printf(Perl_debug_log,
7311                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7312                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7313         });
7314     });
7315
7316     return(c);
7317 }
7318
7319
7320 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7321 /*
7322 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7323 create a copy so that changes the caller makes won't change the shared one.
7324 If <altsvp> is non-null, will return NULL in it, for back-compat.
7325  */
7326 SV *
7327 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7328 {
7329     PERL_ARGS_ASSERT_REGCLASS_SWASH;
7330
7331     if (altsvp) {
7332         *altsvp = NULL;
7333     }
7334
7335     return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7336 }
7337 #endif
7338
7339 STATIC SV *
7340 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7341 {
7342     /* Returns the swash for the input 'node' in the regex 'prog'.
7343      * If <doinit> is true, will attempt to create the swash if not already
7344      *    done.
7345      * If <listsvp> is non-null, will return the swash initialization string in
7346      *    it.
7347      * Tied intimately to how regcomp.c sets up the data structure */
7348
7349     dVAR;
7350     SV *sw  = NULL;
7351     SV *si  = NULL;
7352     SV*  invlist = NULL;
7353
7354     RXi_GET_DECL(prog,progi);
7355     const struct reg_data * const data = prog ? progi->data : NULL;
7356
7357     PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7358
7359     assert(ANYOF_NONBITMAP(node));
7360
7361     if (data && data->count) {
7362         const U32 n = ARG(node);
7363
7364         if (data->what[n] == 's') {
7365             SV * const rv = MUTABLE_SV(data->data[n]);
7366             AV * const av = MUTABLE_AV(SvRV(rv));
7367             SV **const ary = AvARRAY(av);
7368             U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7369
7370             si = *ary;  /* ary[0] = the string to initialize the swash with */
7371
7372             /* Elements 2 and 3 are either both present or both absent. [2] is
7373              * any inversion list generated at compile time; [3] indicates if
7374              * that inversion list has any user-defined properties in it. */
7375             if (av_len(av) >= 2) {
7376                 invlist = ary[2];
7377                 if (SvUV(ary[3])) {
7378                     swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7379                 }
7380             }
7381             else {
7382                 invlist = NULL;
7383             }
7384
7385             /* Element [1] is reserved for the set-up swash.  If already there,
7386              * return it; if not, create it and store it there */
7387             if (SvROK(ary[1])) {
7388                 sw = ary[1];
7389             }
7390             else if (si && doinit) {
7391
7392                 sw = _core_swash_init("utf8", /* the utf8 package */
7393                                       "", /* nameless */
7394                                       si,
7395                                       1, /* binary */
7396                                       0, /* not from tr/// */
7397                                       invlist,
7398                                       &swash_init_flags);
7399                 (void)av_store(av, 1, sw);
7400             }
7401         }
7402     }
7403
7404     if (listsvp) {
7405         SV* matches_string = newSVpvn("", 0);
7406
7407         /* Use the swash, if any, which has to have incorporated into it all
7408          * possibilities */
7409         if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7410             && (si && si != &PL_sv_undef))
7411         {
7412
7413             /* If no swash, use the input initialization string, if available */
7414             sv_catsv(matches_string, si);
7415         }
7416
7417         /* Add the inversion list to whatever we have.  This may have come from
7418          * the swash, or from an input parameter */
7419         if (invlist) {
7420             sv_catsv(matches_string, _invlist_contents(invlist));
7421         }
7422         *listsvp = matches_string;
7423     }
7424
7425     return sw;
7426 }
7427
7428 /*
7429  - reginclass - determine if a character falls into a character class
7430
7431   n is the ANYOF regnode
7432   p is the target string
7433   utf8_target tells whether p is in UTF-8.
7434
7435   Returns true if matched; false otherwise.
7436
7437   Note that this can be a synthetic start class, a combination of various
7438   nodes, so things you think might be mutually exclusive, such as locale,
7439   aren't.  It can match both locale and non-locale
7440
7441  */
7442
7443 STATIC bool
7444 S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7445 {
7446     dVAR;
7447     const char flags = ANYOF_FLAGS(n);
7448     bool match = FALSE;
7449     UV c = *p;
7450
7451     PERL_ARGS_ASSERT_REGINCLASS;
7452
7453     /* If c is not already the code point, get it.  Note that
7454      * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7455     if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7456         STRLEN c_len = 0;
7457         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7458                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7459                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7460                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7461                  * UTF8_ALLOW_FFFF */
7462         if (c_len == (STRLEN)-1)
7463             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7464     }
7465
7466     /* If this character is potentially in the bitmap, check it */
7467     if (c < 256) {
7468         if (ANYOF_BITMAP_TEST(n, c))
7469             match = TRUE;
7470         else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7471                 && ! utf8_target
7472                 && ! isASCII(c))
7473         {
7474             match = TRUE;
7475         }
7476         else if (flags & ANYOF_LOCALE) {
7477             PL_reg_flags |= RF_tainted;
7478
7479             if ((flags & ANYOF_LOC_FOLD)
7480                  && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7481             {
7482                 match = TRUE;
7483             }
7484             else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7485
7486                 /* The data structure is arranged so bits 0, 2, 4, ... are set
7487                  * if the class includes the Posix character class given by
7488                  * bit/2; and 1, 3, 5, ... are set if the class includes the
7489                  * complemented Posix class given by int(bit/2).  So we loop
7490                  * through the bits, each time changing whether we complement
7491                  * the result or not.  Suppose for the sake of illustration
7492                  * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7493                  * is set, it means there is a match for this ANYOF node if the
7494                  * character is in the class given by the expression (0 / 2 = 0
7495                  * = \w).  If it is in that class, isFOO_lc() will return 1,
7496                  * and since 'to_complement' is 0, the result will stay TRUE,
7497                  * and we exit the loop.  Suppose instead that bit 0 is 0, but
7498                  * bit 1 is 1.  That means there is a match if the character
7499                  * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7500                  * but will on bit 1.  On the second iteration 'to_complement'
7501                  * will be 1, so the exclusive or will reverse things, so we
7502                  * are testing for \W.  On the third iteration, 'to_complement'
7503                  * will be 0, and we would be testing for \s; the fourth
7504                  * iteration would test for \S, etc. */
7505
7506                 int count = 0;
7507                 int to_complement = 0;
7508                 while (count < ANYOF_MAX) {
7509                     if (ANYOF_CLASS_TEST(n, count)
7510                         && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7511                     {
7512                         match = TRUE;
7513                         break;
7514                     }
7515                     count++;
7516                     to_complement ^= 1;
7517                 }
7518             }
7519         }
7520     }
7521
7522     /* If the bitmap didn't (or couldn't) match, and something outside the
7523      * bitmap could match, try that.  Locale nodes specify completely the
7524      * behavior of code points in the bit map (otherwise, a utf8 target would
7525      * cause them to be treated as Unicode and not locale), except in
7526      * the very unlikely event when this node is a synthetic start class, which
7527      * could be a combination of locale and non-locale nodes.  So allow locale
7528      * to match for the synthetic start class, which will give a false
7529      * positive that will be resolved when the match is done again as not part
7530      * of the synthetic start class */
7531     if (!match) {
7532         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7533             match = TRUE;       /* Everything above 255 matches */
7534         }
7535         else if (ANYOF_NONBITMAP(n)
7536                  && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7537                      || (utf8_target
7538                          && (c >=256
7539                              || (! (flags & ANYOF_LOCALE))
7540                              || (flags & ANYOF_IS_SYNTHETIC)))))
7541         {
7542             SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7543             if (sw) {
7544                 U8 * utf8_p;
7545                 if (utf8_target) {
7546                     utf8_p = (U8 *) p;
7547                 } else { /* Convert to utf8 */
7548                     STRLEN len = 1;
7549                     utf8_p = bytes_to_utf8(p, &len);
7550                 }
7551
7552                 if (swash_fetch(sw, utf8_p, TRUE)) {
7553                     match = TRUE;
7554                 }
7555
7556                 /* If we allocated a string above, free it */
7557                 if (! utf8_target) Safefree(utf8_p);
7558             }
7559         }
7560
7561         if (UNICODE_IS_SUPER(c)
7562             && (flags & ANYOF_WARN_SUPER)
7563             && ckWARN_d(WARN_NON_UNICODE))
7564         {
7565             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7566                 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7567         }
7568     }
7569
7570     /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7571     return cBOOL(flags & ANYOF_INVERT) ^ match;
7572 }
7573
7574 STATIC U8 *
7575 S_reghop3(U8 *s, I32 off, const U8* lim)
7576 {
7577     /* return the position 'off' UTF-8 characters away from 's', forward if
7578      * 'off' >= 0, backwards if negative.  But don't go outside of position
7579      * 'lim', which better be < s  if off < 0 */
7580
7581     dVAR;
7582
7583     PERL_ARGS_ASSERT_REGHOP3;
7584
7585     if (off >= 0) {
7586         while (off-- && s < lim) {
7587             /* XXX could check well-formedness here */
7588             s += UTF8SKIP(s);
7589         }
7590     }
7591     else {
7592         while (off++ && s > lim) {
7593             s--;
7594             if (UTF8_IS_CONTINUED(*s)) {
7595                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7596                     s--;
7597             }
7598             /* XXX could check well-formedness here */
7599         }
7600     }
7601     return s;
7602 }
7603
7604 #ifdef XXX_dmq
7605 /* there are a bunch of places where we use two reghop3's that should
7606    be replaced with this routine. but since thats not done yet
7607    we ifdef it out - dmq
7608 */
7609 STATIC U8 *
7610 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7611 {
7612     dVAR;
7613
7614     PERL_ARGS_ASSERT_REGHOP4;
7615
7616     if (off >= 0) {
7617         while (off-- && s < rlim) {
7618             /* XXX could check well-formedness here */
7619             s += UTF8SKIP(s);
7620         }
7621     }
7622     else {
7623         while (off++ && s > llim) {
7624             s--;
7625             if (UTF8_IS_CONTINUED(*s)) {
7626                 while (s > llim && UTF8_IS_CONTINUATION(*s))
7627                     s--;
7628             }
7629             /* XXX could check well-formedness here */
7630         }
7631     }
7632     return s;
7633 }
7634 #endif
7635
7636 STATIC U8 *
7637 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7638 {
7639     dVAR;
7640
7641     PERL_ARGS_ASSERT_REGHOPMAYBE3;
7642
7643     if (off >= 0) {
7644         while (off-- && s < lim) {
7645             /* XXX could check well-formedness here */
7646             s += UTF8SKIP(s);
7647         }
7648         if (off >= 0)
7649             return NULL;
7650     }
7651     else {
7652         while (off++ && s > lim) {
7653             s--;
7654             if (UTF8_IS_CONTINUED(*s)) {
7655                 while (s > lim && UTF8_IS_CONTINUATION(*s))
7656                     s--;
7657             }
7658             /* XXX could check well-formedness here */
7659         }
7660         if (off <= 0)
7661             return NULL;
7662     }
7663     return s;
7664 }
7665
7666 static void
7667 restore_pos(pTHX_ void *arg)
7668 {
7669     dVAR;
7670     regexp * const rex = (regexp *)arg;
7671     if (PL_reg_state.re_state_eval_setup_done) {
7672         if (PL_reg_oldsaved) {
7673             rex->subbeg = PL_reg_oldsaved;
7674             rex->sublen = PL_reg_oldsavedlen;
7675             rex->suboffset = PL_reg_oldsavedoffset;
7676             rex->subcoffset = PL_reg_oldsavedcoffset;
7677 #ifdef PERL_ANY_COW
7678             rex->saved_copy = PL_nrs;
7679 #endif
7680             RXp_MATCH_COPIED_on(rex);
7681         }
7682         PL_reg_magic->mg_len = PL_reg_oldpos;
7683         PL_reg_state.re_state_eval_setup_done = FALSE;
7684         PL_curpm = PL_reg_oldcurpm;
7685     }
7686 }
7687
7688 STATIC void
7689 S_to_utf8_substr(pTHX_ regexp *prog)
7690 {
7691     /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7692      * on the converted value */
7693
7694     int i = 1;
7695
7696     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7697
7698     do {
7699         if (prog->substrs->data[i].substr
7700             && !prog->substrs->data[i].utf8_substr) {
7701             SV* const sv = newSVsv(prog->substrs->data[i].substr);
7702             prog->substrs->data[i].utf8_substr = sv;
7703             sv_utf8_upgrade(sv);
7704             if (SvVALID(prog->substrs->data[i].substr)) {
7705                 if (SvTAIL(prog->substrs->data[i].substr)) {
7706                     /* Trim the trailing \n that fbm_compile added last
7707                        time.  */
7708                     SvCUR_set(sv, SvCUR(sv) - 1);
7709                     /* Whilst this makes the SV technically "invalid" (as its
7710                        buffer is no longer followed by "\0") when fbm_compile()
7711                        adds the "\n" back, a "\0" is restored.  */
7712                     fbm_compile(sv, FBMcf_TAIL);
7713                 } else
7714                     fbm_compile(sv, 0);
7715             }
7716             if (prog->substrs->data[i].substr == prog->check_substr)
7717                 prog->check_utf8 = sv;
7718         }
7719     } while (i--);
7720 }
7721
7722 STATIC bool
7723 S_to_byte_substr(pTHX_ regexp *prog)
7724 {
7725     /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7726      * on the converted value; returns FALSE if can't be converted. */
7727
7728     dVAR;
7729     int i = 1;
7730
7731     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7732
7733     do {
7734         if (prog->substrs->data[i].utf8_substr
7735             && !prog->substrs->data[i].substr) {
7736             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7737             if (! sv_utf8_downgrade(sv, TRUE)) {
7738                 return FALSE;
7739             }
7740             if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7741                 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7742                     /* Trim the trailing \n that fbm_compile added last
7743                         time.  */
7744                     SvCUR_set(sv, SvCUR(sv) - 1);
7745                     fbm_compile(sv, FBMcf_TAIL);
7746                 } else
7747                     fbm_compile(sv, 0);
7748             }
7749             prog->substrs->data[i].substr = sv;
7750             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7751                 prog->check_substr = sv;
7752         }
7753     } while (i--);
7754
7755     return TRUE;
7756 }
7757
7758 /*
7759  * Local variables:
7760  * c-indentation-style: bsd
7761  * c-basic-offset: 4
7762  * indent-tabs-mode: nil
7763  * End:
7764  *
7765  * ex: set ts=8 sts=4 sw=4 et:
7766  */