src/5013006/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1               /* tainted information used? */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
  99
 100 /*
 101  * Forwards.
 102  */
 103
 104 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 105 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 106
 107 #define HOPc(pos,off) \
 108         (char *)(PL_reg_match_utf8 \
 109             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 110             : (U8*)(pos + off))
 111 #define HOPBACKc(pos, off) \
 112         (char*)(PL_reg_match_utf8\
 113             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 114             : (pos - off >= PL_bostr)           \
 115                 ? (U8*)pos - off                \
 116                 : NULL)
 117
 118 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 119 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 120
 121 /* these are unrolled below in the CCC_TRY_XXX defined */
 122 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 123     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 124
 125 /* Doesn't do an assert to verify that is correct */
 126 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 127     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 128
 129 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 130 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 131 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 132
 133 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 134         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 135         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 136         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 137             * assert should likely and hopefully fail on an EBCDIC machine */ \
 138         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 139                                                                             \
 140         /* No asserts are done for these, in case called on an early        \
 141             * Unicode version in which they map to nothing */               \
 142         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 143         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 144         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 149
 150 /*
 151    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 152    so that it is possible to override the option here without having to
 153    rebuild the entire core. as we are required to do if we change regcomp.h
 154    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 155 */
 156 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 157 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 158 #endif
 159
 160 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 161 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 162 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 163 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 164 #define RE_utf8_perl_word   PL_utf8_alnum
 165 #define RE_utf8_perl_space  PL_utf8_space
 166 #define RE_utf8_posix_digit PL_utf8_digit
 167 #define perl_word  alnum
 168 #define perl_space space
 169 #define posix_digit digit
 170 #else
 171 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 172 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 173 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 174 #define RE_utf8_perl_word   PL_utf8_perl_word
 175 #define RE_utf8_perl_space  PL_utf8_perl_space
 176 #define RE_utf8_posix_digit PL_utf8_posix_digit
 177 #endif
 178
 179
 180 #define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
 181         case NAMEL:                                                         \
 182             PL_reg_flags |= RF_tainted;                                     \
 183             /* FALL THROUGH */                                              \
 184         case NAME:                                                          \
 185             if (!nextchr)                                                   \
 186                 sayNO;                                                      \
 187             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
 188                 if (!CAT2(PL_utf8_,CLASS)) {                                \
 189                     bool ok;                                                \
 190                     ENTER;                                                  \
 191                     save_re_context();                                      \
 192                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
 193                     assert(ok);                                             \
 194                     LEAVE;                                                  \
 195                 }                                                           \
 196                 if (!(OP(scan) == NAME                                      \
 197                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 198                     : LCFUNC_utf8((U8*)locinput)))                          \
 199                 {                                                           \
 200                     sayNO;                                                  \
 201                 }                                                           \
 202                 locinput += PL_utf8skip[nextchr];                           \
 203                 nextchr = UCHARAT(locinput);                                \
 204                 break;                                                      \
 205             }                                                               \
 206             /* Drops through to the macro that calls this one */
 207
 208 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
 209     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
 210             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
 211                 sayNO;                                                      \
 212             nextchr = UCHARAT(++locinput);                                  \
 213             break
 214
 215 /* Almost identical to the above, but has a case for a node that matches chars
 216  * between 128 and 255 using Unicode (latin1) semantics. */
 217 #define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 218     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 219             if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 220                 sayNO;                                                       \
 221             nextchr = UCHARAT(++locinput);                                   \
 222             break
 223
 224 #define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
 225         case NAMEL:                                                          \
 226             PL_reg_flags |= RF_tainted;                                      \
 227             /* FALL THROUGH */                                               \
 228         case NAME :                                                          \
 229             if (!nextchr && locinput >= PL_regeol)                           \
 230                 sayNO;                                                       \
 231             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
 232                 if (!CAT2(PL_utf8_,CLASS)) {                                 \
 233                     bool ok;                                                 \
 234                     ENTER;                                                   \
 235                     save_re_context();                                       \
 236                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
 237                     assert(ok);                                              \
 238                     LEAVE;                                                   \
 239                 }                                                            \
 240                 if ((OP(scan) == NAME                                        \
 241                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 242                     : LCFUNC_utf8((U8*)locinput)))                           \
 243                 {                                                            \
 244                     sayNO;                                                   \
 245                 }                                                            \
 246                 locinput += PL_utf8skip[nextchr];                            \
 247                 nextchr = UCHARAT(locinput);                                 \
 248                 break;                                                       \
 249             }
 250
 251 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
 252     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 253             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
 254                 sayNO;                                                       \
 255             nextchr = UCHARAT(++locinput);                                   \
 256             break
 257
 258
 259 #define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 260     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
 261             if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 262                 sayNO;                                                       \
 263             nextchr = UCHARAT(++locinput);                                   \
 264             break
 265
 266
 267
 268 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 269
 270 /* for use after a quantifier and before an EXACT-like node -- japhy */
 271 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 272  *
 273  * NOTE that *nothing* that affects backtracking should be in here, specifically
 274  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 275  * node that is in between two EXACT like nodes when ascertaining what the required
 276  * "follow" character is. This should probably be moved to regex compile time
 277  * although it may be done at run time beause of the REF possibility - more
 278  * investigation required. -- demerphq
 279 */
 280 #define JUMPABLE(rn) (      \
 281     OP(rn) == OPEN ||       \
 282     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 283     OP(rn) == EVAL ||   \
 284     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 285     OP(rn) == PLUS || OP(rn) == MINMOD || \
 286     OP(rn) == KEEPS || \
 287     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 288 )
 289 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 290
 291 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 292
 293 #if 0
 294 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 295    we don't need this definition. */
 296 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 297 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 298 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 299
 300 #else
 301 /* ... so we use this as its faster. */
 302 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 303 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 304 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 305
 306 #endif
 307
 308 /*
 309   Search for mandatory following text node; for lookahead, the text must
 310   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 311 */
 312 #define FIND_NEXT_IMPT(rn) STMT_START { \
 313     while (JUMPABLE(rn)) { \
 314         const OPCODE type = OP(rn); \
 315         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 316             rn = NEXTOPER(NEXTOPER(rn)); \
 317         else if (type == PLUS) \
 318             rn = NEXTOPER(rn); \
 319         else if (type == IFMATCH) \
 320             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 321         else rn += NEXT_OFF(rn); \
 322     } \
 323 } STMT_END
 324
 325
 326 static void restore_pos(pTHX_ void *arg);
 327
 328 #define REGCP_PAREN_ELEMS 4
 329 #define REGCP_OTHER_ELEMS 5
 330 #define REGCP_FRAME_ELEMS 1
 331 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 332  * are needed for the regexp context stack bookkeeping. */
 333
 334 STATIC CHECKPOINT
 335 S_regcppush(pTHX_ I32 parenfloor)
 336 {
 337     dVAR;
 338     const int retval = PL_savestack_ix;
 339     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 340     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 341     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 342     int p;
 343     GET_RE_DEBUG_FLAGS_DECL;
 344
 345     if (paren_elems_to_push < 0)
 346         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 347
 348     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 349         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 350                    " out of range (%lu-%ld)",
 351                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 352
 353     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 354
 355     for (p = PL_regsize; p > parenfloor; p--) {
 356 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 357         SSPUSHINT(PL_regoffs[p].end);
 358         SSPUSHINT(PL_regoffs[p].start);
 359         SSPUSHPTR(PL_reg_start_tmp[p]);
 360         SSPUSHINT(p);
 361         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 362           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 363                       (UV)p, (IV)PL_regoffs[p].start,
 364                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 365                       (IV)PL_regoffs[p].end
 366         ));
 367     }
 368 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 369     SSPUSHPTR(PL_regoffs);
 370     SSPUSHINT(PL_regsize);
 371     SSPUSHINT(*PL_reglastparen);
 372     SSPUSHINT(*PL_reglastcloseparen);
 373     SSPUSHPTR(PL_reginput);
 374     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 375
 376     return retval;
 377 }
 378
 379 /* These are needed since we do not localize EVAL nodes: */
 380 #define REGCP_SET(cp)                                           \
 381     DEBUG_STATE_r(                                              \
 382             PerlIO_printf(Perl_debug_log,                       \
 383                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 384                 (IV)PL_savestack_ix));                          \
 385     cp = PL_savestack_ix
 386
 387 #define REGCP_UNWIND(cp)                                        \
 388     DEBUG_STATE_r(                                              \
 389         if (cp != PL_savestack_ix)                              \
 390             PerlIO_printf(Perl_debug_log,                       \
 391                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 392                 (IV)(cp), (IV)PL_savestack_ix));                \
 393     regcpblow(cp)
 394
 395 STATIC char *
 396 S_regcppop(pTHX_ const regexp *rex)
 397 {
 398     dVAR;
 399     UV i;
 400     char *input;
 401     GET_RE_DEBUG_FLAGS_DECL;
 402
 403     PERL_ARGS_ASSERT_REGCPPOP;
 404
 405     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 406     i = SSPOPUV;
 407     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 408     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 409     input = (char *) SSPOPPTR;
 410     *PL_reglastcloseparen = SSPOPINT;
 411     *PL_reglastparen = SSPOPINT;
 412     PL_regsize = SSPOPINT;
 413     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 414
 415     i -= REGCP_OTHER_ELEMS;
 416     /* Now restore the parentheses context. */
 417     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 418         I32 tmps;
 419         U32 paren = (U32)SSPOPINT;
 420         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 421         PL_regoffs[paren].start = SSPOPINT;
 422         tmps = SSPOPINT;
 423         if (paren <= *PL_reglastparen)
 424             PL_regoffs[paren].end = tmps;
 425         DEBUG_BUFFERS_r(
 426             PerlIO_printf(Perl_debug_log,
 427                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 428                           (UV)paren, (IV)PL_regoffs[paren].start,
 429                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 430                           (IV)PL_regoffs[paren].end,
 431                           (paren > *PL_reglastparen ? "(no)" : ""));
 432         );
 433     }
 434     DEBUG_BUFFERS_r(
 435         if (*PL_reglastparen + 1 <= rex->nparens) {
 436             PerlIO_printf(Perl_debug_log,
 437                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 438                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 439         }
 440     );
 441 #if 1
 442     /* It would seem that the similar code in regtry()
 443      * already takes care of this, and in fact it is in
 444      * a better location to since this code can #if 0-ed out
 445      * but the code in regtry() is needed or otherwise tests
 446      * requiring null fields (pat.t#187 and split.t#{13,14}
 447      * (as of patchlevel 7877)  will fail.  Then again,
 448      * this code seems to be necessary or otherwise
 449      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 450      * --jhi updated by dapm */
 451     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 452         if (i > PL_regsize)
 453             PL_regoffs[i].start = -1;
 454         PL_regoffs[i].end = -1;
 455     }
 456 #endif
 457     return input;
 458 }
 459
 460 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 461
 462 /*
 463  * pregexec and friends
 464  */
 465
 466 #ifndef PERL_IN_XSUB_RE
 467 /*
 468  - pregexec - match a regexp against a string
 469  */
 470 I32
 471 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 472          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 473 /* strend: pointer to null at end of string */
 474 /* strbeg: real beginning of string */
 475 /* minend: end of match must be >=minend after stringarg. */
 476 /* nosave: For optimizations. */
 477 {
 478     PERL_ARGS_ASSERT_PREGEXEC;
 479
 480     return
 481         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 482                       nosave ? 0 : REXEC_COPY_STR);
 483 }
 484 #endif
 485
 486 /*
 487  * Need to implement the following flags for reg_anch:
 488  *
 489  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 490  * USE_INTUIT_ML
 491  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 492  * INTUIT_AUTORITATIVE_ML
 493  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 494  * INTUIT_ONCE_ML
 495  *
 496  * Another flag for this function: SECOND_TIME (so that float substrs
 497  * with giant delta may be not rechecked).
 498  */
 499
 500 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 501
 502 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 503    Otherwise, only SvCUR(sv) is used to get strbeg. */
 504
 505 /* XXXX We assume that strpos is strbeg unless sv. */
 506
 507 /* XXXX Some places assume that there is a fixed substring.
 508         An update may be needed if optimizer marks as "INTUITable"
 509         RExen without fixed substrings.  Similarly, it is assumed that
 510         lengths of all the strings are no more than minlen, thus they
 511         cannot come from lookahead.
 512         (Or minlen should take into account lookahead.)
 513   NOTE: Some of this comment is not correct. minlen does now take account
 514   of lookahead/behind. Further research is required. -- demerphq
 515
 516 */
 517
 518 /* A failure to find a constant substring means that there is no need to make
 519    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 520    finding a substring too deep into the string means that less calls to
 521    regtry() should be needed.
 522
 523    REx compiler's optimizer found 4 possible hints:
 524         a) Anchored substring;
 525         b) Fixed substring;
 526         c) Whether we are anchored (beginning-of-line or \G);
 527         d) First node (of those at offset 0) which may distingush positions;
 528    We use a)b)d) and multiline-part of c), and try to find a position in the
 529    string which does not contradict any of them.
 530  */
 531
 532 /* Most of decisions we do here should have been done at compile time.
 533    The nodes of the REx which we used for the search should have been
 534    deleted from the finite automaton. */
 535
 536 char *
 537 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 538                      char *strend, const U32 flags, re_scream_pos_data *data)
 539 {
 540     dVAR;
 541     struct regexp *const prog = (struct regexp *)SvANY(rx);
 542     register I32 start_shift = 0;
 543     /* Should be nonnegative! */
 544     register I32 end_shift   = 0;
 545     register char *s;
 546     register SV *check;
 547     char *strbeg;
 548     char *t;
 549     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 550     I32 ml_anch;
 551     register char *other_last = NULL;   /* other substr checked before this */
 552     char *check_at = NULL;              /* check substr found at this pos */
 553     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 554     RXi_GET_DECL(prog,progi);
 555 #ifdef DEBUGGING
 556     const char * const i_strpos = strpos;
 557 #endif
 558     GET_RE_DEBUG_FLAGS_DECL;
 559
 560     PERL_ARGS_ASSERT_RE_INTUIT_START;
 561
 562     RX_MATCH_UTF8_set(rx,utf8_target);
 563
 564     if (RX_UTF8(rx)) {
 565         PL_reg_flags |= RF_utf8;
 566     }
 567     DEBUG_EXECUTE_r(
 568         debug_start_match(rx, utf8_target, strpos, strend,
 569             sv ? "Guessing start of match in sv for"
 570                : "Guessing start of match in string for");
 571               );
 572
 573     /* CHR_DIST() would be more correct here but it makes things slow. */
 574     if (prog->minlen > strend - strpos) {
 575         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 576                               "String too short... [re_intuit_start]\n"));
 577         goto fail;
 578     }
 579
 580     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 581     PL_regeol = strend;
 582     if (utf8_target) {
 583         if (!prog->check_utf8 && prog->check_substr)
 584             to_utf8_substr(prog);
 585         check = prog->check_utf8;
 586     } else {
 587         if (!prog->check_substr && prog->check_utf8)
 588             to_byte_substr(prog);
 589         check = prog->check_substr;
 590     }
 591     if (check == &PL_sv_undef) {
 592         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 593                 "Non-utf8 string cannot match utf8 check string\n"));
 594         goto fail;
 595     }
 596     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 597         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 598                      || ( (prog->extflags & RXf_ANCH_BOL)
 599                           && !multiline ) );    /* Check after \n? */
 600
 601         if (!ml_anch) {
 602           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 603                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 604                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 605                && sv && !SvROK(sv)
 606                && (strpos != strbeg)) {
 607               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 608               goto fail;
 609           }
 610           if (prog->check_offset_min == prog->check_offset_max &&
 611               !(prog->extflags & RXf_CANY_SEEN)) {
 612             /* Substring at constant offset from beg-of-str... */
 613             I32 slen;
 614
 615             s = HOP3c(strpos, prog->check_offset_min, strend);
 616
 617             if (SvTAIL(check)) {
 618                 slen = SvCUR(check);    /* >= 1 */
 619
 620                 if ( strend - s > slen || strend - s < slen - 1
 621                      || (strend - s == slen && strend[-1] != '\n')) {
 622                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 623                     goto fail_finish;
 624                 }
 625                 /* Now should match s[0..slen-2] */
 626                 slen--;
 627                 if (slen && (*SvPVX_const(check) != *s
 628                              || (slen > 1
 629                                  && memNE(SvPVX_const(check), s, slen)))) {
 630                   report_neq:
 631                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 632                     goto fail_finish;
 633                 }
 634             }
 635             else if (*SvPVX_const(check) != *s
 636                      || ((slen = SvCUR(check)) > 1
 637                          && memNE(SvPVX_const(check), s, slen)))
 638                 goto report_neq;
 639             check_at = s;
 640             goto success_at_start;
 641           }
 642         }
 643         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 644         s = strpos;
 645         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 646         end_shift = prog->check_end_shift;
 647
 648         if (!ml_anch) {
 649             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 650                                          - (SvTAIL(check) != 0);
 651             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 652
 653             if (end_shift < eshift)
 654                 end_shift = eshift;
 655         }
 656     }
 657     else {                              /* Can match at random position */
 658         ml_anch = 0;
 659         s = strpos;
 660         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 661         end_shift = prog->check_end_shift;
 662
 663         /* end shift should be non negative here */
 664     }
 665
 666 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 667     if (end_shift < 0)
 668         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 669                    (IV)end_shift, RX_PRECOMP(prog));
 670 #endif
 671
 672   restart:
 673     /* Find a possible match in the region s..strend by looking for
 674        the "check" substring in the region corrected by start/end_shift. */
 675
 676     {
 677         I32 srch_start_shift = start_shift;
 678         I32 srch_end_shift = end_shift;
 679         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 680             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 681             srch_start_shift = strbeg - s;
 682         }
 683     DEBUG_OPTIMISE_MORE_r({
 684         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 685             (IV)prog->check_offset_min,
 686             (IV)srch_start_shift,
 687             (IV)srch_end_shift,
 688             (IV)prog->check_end_shift);
 689     });
 690
 691     if (flags & REXEC_SCREAM) {
 692         I32 p = -1;                     /* Internal iterator of scream. */
 693         I32 * const pp = data ? data->scream_pos : &p;
 694
 695         if (PL_screamfirst[BmRARE(check)] >= 0
 696             || ( BmRARE(check) == '\n'
 697                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 698                  && SvTAIL(check) ))
 699             s = screaminstr(sv, check,
 700                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 701         else
 702             goto fail_finish;
 703         /* we may be pointing at the wrong string */
 704         if (s && RXp_MATCH_COPIED(prog))
 705             s = strbeg + (s - SvPVX_const(sv));
 706         if (data)
 707             *data->scream_olds = s;
 708     }
 709     else {
 710         U8* start_point;
 711         U8* end_point;
 712         if (prog->extflags & RXf_CANY_SEEN) {
 713             start_point= (U8*)(s + srch_start_shift);
 714             end_point= (U8*)(strend - srch_end_shift);
 715         } else {
 716             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 717             end_point= HOP3(strend, -srch_end_shift, strbeg);
 718         }
 719         DEBUG_OPTIMISE_MORE_r({
 720             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 721                 (int)(end_point - start_point),
 722                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 723                 start_point);
 724         });
 725
 726         s = fbm_instr( start_point, end_point,
 727                       check, multiline ? FBMrf_MULTILINE : 0);
 728     }
 729     }
 730     /* Update the count-of-usability, remove useless subpatterns,
 731         unshift s.  */
 732
 733     DEBUG_EXECUTE_r({
 734         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 735             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 736         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 737                           (s ? "Found" : "Did not find"),
 738             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 739                 ? "anchored" : "floating"),
 740             quoted,
 741             RE_SV_TAIL(check),
 742             (s ? " at offset " : "...\n") );
 743     });
 744
 745     if (!s)
 746         goto fail_finish;
 747     /* Finish the diagnostic message */
 748     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 749
 750     /* XXX dmq: first branch is for positive lookbehind...
 751        Our check string is offset from the beginning of the pattern.
 752        So we need to do any stclass tests offset forward from that
 753        point. I think. :-(
 754      */
 755
 756
 757
 758     check_at=s;
 759
 760
 761     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 762        Start with the other substr.
 763        XXXX no SCREAM optimization yet - and a very coarse implementation
 764        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 765                 *always* match.  Probably should be marked during compile...
 766        Probably it is right to do no SCREAM here...
 767      */
 768
 769     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 770                 : (prog->float_substr && prog->anchored_substr))
 771     {
 772         /* Take into account the "other" substring. */
 773         /* XXXX May be hopelessly wrong for UTF... */
 774         if (!other_last)
 775             other_last = strpos;
 776         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 777           do_other_anchored:
 778             {
 779                 char * const last = HOP3c(s, -start_shift, strbeg);
 780                 char *last1, *last2;
 781                 char * const saved_s = s;
 782                 SV* must;
 783
 784                 t = s - prog->check_offset_max;
 785                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 786                     && (!utf8_target
 787                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 788                             && t > strpos)))
 789                     NOOP;
 790                 else
 791                     t = strpos;
 792                 t = HOP3c(t, prog->anchored_offset, strend);
 793                 if (t < other_last)     /* These positions already checked */
 794                     t = other_last;
 795                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 796                 if (last < last1)
 797                     last1 = last;
 798                 /* XXXX It is not documented what units *_offsets are in.
 799                    We assume bytes, but this is clearly wrong.
 800                    Meaning this code needs to be carefully reviewed for errors.
 801                    dmq.
 802                   */
 803
 804                 /* On end-of-str: see comment below. */
 805                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 806                 if (must == &PL_sv_undef) {
 807                     s = (char*)NULL;
 808                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 809                 }
 810                 else
 811                     s = fbm_instr(
 812                         (unsigned char*)t,
 813                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 814                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 815                         must,
 816                         multiline ? FBMrf_MULTILINE : 0
 817                     );
 818                 DEBUG_EXECUTE_r({
 819                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 820                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 821                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 822                         (s ? "Found" : "Contradicts"),
 823                         quoted, RE_SV_TAIL(must));
 824                 });
 825
 826
 827                 if (!s) {
 828                     if (last1 >= last2) {
 829                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 830                                                 ", giving up...\n"));
 831                         goto fail_finish;
 832                     }
 833                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 834                         ", trying floating at offset %ld...\n",
 835                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 836                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 837                     s = HOP3c(last, 1, strend);
 838                     goto restart;
 839                 }
 840                 else {
 841                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 842                           (long)(s - i_strpos)));
 843                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 844                     other_last = HOP3c(s, 1, strend);
 845                     s = saved_s;
 846                     if (t == strpos)
 847                         goto try_at_start;
 848                     goto try_at_offset;
 849                 }
 850             }
 851         }
 852         else {          /* Take into account the floating substring. */
 853             char *last, *last1;
 854             char * const saved_s = s;
 855             SV* must;
 856
 857             t = HOP3c(s, -start_shift, strbeg);
 858             last1 = last =
 859                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 860             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 861                 last = HOP3c(t, prog->float_max_offset, strend);
 862             s = HOP3c(t, prog->float_min_offset, strend);
 863             if (s < other_last)
 864                 s = other_last;
 865  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 866             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 867             /* fbm_instr() takes into account exact value of end-of-str
 868                if the check is SvTAIL(ed).  Since false positives are OK,
 869                and end-of-str is not later than strend we are OK. */
 870             if (must == &PL_sv_undef) {
 871                 s = (char*)NULL;
 872                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 873             }
 874             else
 875                 s = fbm_instr((unsigned char*)s,
 876                               (unsigned char*)last + SvCUR(must)
 877                                   - (SvTAIL(must)!=0),
 878                               must, multiline ? FBMrf_MULTILINE : 0);
 879             DEBUG_EXECUTE_r({
 880                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 881                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 882                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 883                     (s ? "Found" : "Contradicts"),
 884                     quoted, RE_SV_TAIL(must));
 885             });
 886             if (!s) {
 887                 if (last1 == last) {
 888                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 889                                             ", giving up...\n"));
 890                     goto fail_finish;
 891                 }
 892                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 893                     ", trying anchored starting at offset %ld...\n",
 894                     (long)(saved_s + 1 - i_strpos)));
 895                 other_last = last;
 896                 s = HOP3c(t, 1, strend);
 897                 goto restart;
 898             }
 899             else {
 900                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 901                       (long)(s - i_strpos)));
 902                 other_last = s; /* Fix this later. --Hugo */
 903                 s = saved_s;
 904                 if (t == strpos)
 905                     goto try_at_start;
 906                 goto try_at_offset;
 907             }
 908         }
 909     }
 910
 911
 912     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 913
 914     DEBUG_OPTIMISE_MORE_r(
 915         PerlIO_printf(Perl_debug_log,
 916             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 917             (IV)prog->check_offset_min,
 918             (IV)prog->check_offset_max,
 919             (IV)(s-strpos),
 920             (IV)(t-strpos),
 921             (IV)(t-s),
 922             (IV)(strend-strpos)
 923         )
 924     );
 925
 926     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 927         && (!utf8_target
 928             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 929                  && t > strpos)))
 930     {
 931         /* Fixed substring is found far enough so that the match
 932            cannot start at strpos. */
 933       try_at_offset:
 934         if (ml_anch && t[-1] != '\n') {
 935             /* Eventually fbm_*() should handle this, but often
 936                anchored_offset is not 0, so this check will not be wasted. */
 937             /* XXXX In the code below we prefer to look for "^" even in
 938                presence of anchored substrings.  And we search even
 939                beyond the found float position.  These pessimizations
 940                are historical artefacts only.  */
 941           find_anchor:
 942             while (t < strend - prog->minlen) {
 943                 if (*t == '\n') {
 944                     if (t < check_at - prog->check_offset_min) {
 945                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 946                             /* Since we moved from the found position,
 947                                we definitely contradict the found anchored
 948                                substr.  Due to the above check we do not
 949                                contradict "check" substr.
 950                                Thus we can arrive here only if check substr
 951                                is float.  Redo checking for "other"=="fixed".
 952                              */
 953                             strpos = t + 1;
 954                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 955                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 956                             goto do_other_anchored;
 957                         }
 958                         /* We don't contradict the found floating substring. */
 959                         /* XXXX Why not check for STCLASS? */
 960                         s = t + 1;
 961                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 962                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 963                         goto set_useful;
 964                     }
 965                     /* Position contradicts check-string */
 966                     /* XXXX probably better to look for check-string
 967                        than for "\n", so one should lower the limit for t? */
 968                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 969                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 970                     other_last = strpos = s = t + 1;
 971                     goto restart;
 972                 }
 973                 t++;
 974             }
 975             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 976                         PL_colors[0], PL_colors[1]));
 977             goto fail_finish;
 978         }
 979         else {
 980             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 981                         PL_colors[0], PL_colors[1]));
 982         }
 983         s = t;
 984       set_useful:
 985         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 986     }
 987     else {
 988         /* The found string does not prohibit matching at strpos,
 989            - no optimization of calling REx engine can be performed,
 990            unless it was an MBOL and we are not after MBOL,
 991            or a future STCLASS check will fail this. */
 992       try_at_start:
 993         /* Even in this situation we may use MBOL flag if strpos is offset
 994            wrt the start of the string. */
 995         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 996             && (strpos != strbeg) && strpos[-1] != '\n'
 997             /* May be due to an implicit anchor of m{.*foo}  */
 998             && !(prog->intflags & PREGf_IMPLICIT))
 999         {
1000             t = strpos;
1001             goto find_anchor;
1002         }
1003         DEBUG_EXECUTE_r( if (ml_anch)
1004             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1005                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1006         );
1007       success_at_start:
1008         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1009             && (utf8_target ? (
1010                 prog->check_utf8                /* Could be deleted already */
1011                 && --BmUSEFUL(prog->check_utf8) < 0
1012                 && (prog->check_utf8 == prog->float_utf8)
1013             ) : (
1014                 prog->check_substr              /* Could be deleted already */
1015                 && --BmUSEFUL(prog->check_substr) < 0
1016                 && (prog->check_substr == prog->float_substr)
1017             )))
1018         {
1019             /* If flags & SOMETHING - do not do it many times on the same match */
1020             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1021             /* XXX Does the destruction order has to change with utf8_target? */
1022             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1023             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1024             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1025             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1026             check = NULL;                       /* abort */
1027             s = strpos;
1028             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag
1029                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1030             if (prog->intflags & PREGf_IMPLICIT)
1031                 prog->extflags &= ~RXf_ANCH_MBOL;
1032             /* XXXX This is a remnant of the old implementation.  It
1033                     looks wasteful, since now INTUIT can use many
1034                     other heuristics. */
1035             prog->extflags &= ~RXf_USE_INTUIT;
1036             /* XXXX What other flags might need to be cleared in this branch? */
1037         }
1038         else
1039             s = strpos;
1040     }
1041
1042     /* Last resort... */
1043     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1044     /* trie stclasses are too expensive to use here, we are better off to
1045        leave it to regmatch itself */
1046     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1047         /* minlen == 0 is possible if regstclass is \b or \B,
1048            and the fixed substr is ''$.
1049            Since minlen is already taken into account, s+1 is before strend;
1050            accidentally, minlen >= 1 guaranties no false positives at s + 1
1051            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1052            regstclass does not come from lookahead...  */
1053         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1054            This leaves EXACTF only, which is dealt with in find_byclass().  */
1055         const U8* const str = (U8*)STRING(progi->regstclass);
1056         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1057                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1058                     : 1);
1059         char * endpos;
1060         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1061             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1062         else if (prog->float_substr || prog->float_utf8)
1063             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1064         else
1065             endpos= strend;
1066
1067         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1068                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1069
1070         t = s;
1071         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1072         if (!s) {
1073 #ifdef DEBUGGING
1074             const char *what = NULL;
1075 #endif
1076             if (endpos == strend) {
1077                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1078                                 "Could not match STCLASS...\n") );
1079                 goto fail;
1080             }
1081             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1082                                    "This position contradicts STCLASS...\n") );
1083             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1084                 goto fail;
1085             /* Contradict one of substrings */
1086             if (prog->anchored_substr || prog->anchored_utf8) {
1087                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1088                     DEBUG_EXECUTE_r( what = "anchored" );
1089                   hop_and_restart:
1090                     s = HOP3c(t, 1, strend);
1091                     if (s + start_shift + end_shift > strend) {
1092                         /* XXXX Should be taken into account earlier? */
1093                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1094                                                "Could not match STCLASS...\n") );
1095                         goto fail;
1096                     }
1097                     if (!check)
1098                         goto giveup;
1099                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1100                                 "Looking for %s substr starting at offset %ld...\n",
1101                                  what, (long)(s + start_shift - i_strpos)) );
1102                     goto restart;
1103                 }
1104                 /* Have both, check_string is floating */
1105                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1106                     goto retry_floating_check;
1107                 /* Recheck anchored substring, but not floating... */
1108                 s = check_at;
1109                 if (!check)
1110                     goto giveup;
1111                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1112                           "Looking for anchored substr starting at offset %ld...\n",
1113                           (long)(other_last - i_strpos)) );
1114                 goto do_other_anchored;
1115             }
1116             /* Another way we could have checked stclass at the
1117                current position only: */
1118             if (ml_anch) {
1119                 s = t = t + 1;
1120                 if (!check)
1121                     goto giveup;
1122                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1123                           "Looking for /%s^%s/m starting at offset %ld...\n",
1124                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1125                 goto try_at_offset;
1126             }
1127             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1128                 goto fail;
1129             /* Check is floating subtring. */
1130           retry_floating_check:
1131             t = check_at - start_shift;
1132             DEBUG_EXECUTE_r( what = "floating" );
1133             goto hop_and_restart;
1134         }
1135         if (t != s) {
1136             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1137                         "By STCLASS: moving %ld --> %ld\n",
1138                                   (long)(t - i_strpos), (long)(s - i_strpos))
1139                    );
1140         }
1141         else {
1142             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1143                                   "Does not contradict STCLASS...\n");
1144                    );
1145         }
1146     }
1147   giveup:
1148     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1149                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1150                           PL_colors[5], (long)(s - i_strpos)) );
1151     return s;
1152
1153   fail_finish:                          /* Substring not found */
1154     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1155         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1156   fail:
1157     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1158                           PL_colors[4], PL_colors[5]));
1159     return NULL;
1160 }
1161
1162 #define DECL_TRIE_TYPE(scan) \
1163     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1164                     trie_type = (scan->flags != EXACT) \
1165                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1166                               : (utf8_target ? trie_utf8 : trie_plain)
1167
1168 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1169 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1170     switch (trie_type) {                                                    \
1171     case trie_utf8_fold:                                                    \
1172         if ( foldlen>0 ) {                                                  \
1173             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1174             foldlen -= len;                                                 \
1175             uscan += len;                                                   \
1176             len=0;                                                          \
1177         } else {                                                            \
1178             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1179             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1180             foldlen -= UNISKIP( uvc );                                      \
1181             uscan = foldbuf + UNISKIP( uvc );                               \
1182         }                                                                   \
1183         break;                                                              \
1184     case trie_latin_utf8_fold:                                              \
1185         if ( foldlen>0 ) {                                                  \
1186             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1187             foldlen -= len;                                                 \
1188             uscan += len;                                                   \
1189             len=0;                                                          \
1190         } else {                                                            \
1191             len = 1;                                                        \
1192             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1193             foldlen -= UNISKIP( uvc );                                      \
1194             uscan = foldbuf + UNISKIP( uvc );                               \
1195         }                                                                   \
1196         break;                                                              \
1197     case trie_utf8:                                                         \
1198         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1199         break;                                                              \
1200     case trie_plain:                                                        \
1201         uvc = (UV)*uc;                                                      \
1202         len = 1;                                                            \
1203     }                                                                       \
1204     if (uvc < 256) {                                                        \
1205         charid = trie->charmap[ uvc ];                                      \
1206     }                                                                       \
1207     else {                                                                  \
1208         charid = 0;                                                         \
1209         if (widecharmap) {                                                  \
1210             SV** const svpp = hv_fetch(widecharmap,                         \
1211                         (char*)&uvc, sizeof(UV), 0);                        \
1212             if (svpp)                                                       \
1213                 charid = (U16)SvIV(*svpp);                                  \
1214         }                                                                   \
1215     }                                                                       \
1216 } STMT_END
1217
1218 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1219 {                                                      \
1220     char *my_strend= (char *)strend;                   \
1221     if ( (CoNd)                                        \
1222          && (ln == len ||                              \
1223              foldEQ_utf8(s, &my_strend, 0,  utf8_target,   \
1224                         m, NULL, ln, cBOOL(UTF_PATTERN)))      \
1225          && (!reginfo || regtry(reginfo, &s)) )        \
1226         goto got_it;                                   \
1227     else {                                             \
1228          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1229          uvchr_to_utf8(tmpbuf, c);                     \
1230          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1231          if ( f != c                                   \
1232               && (f == c1 || f == c2)                  \
1233               && (ln == len ||                         \
1234                 foldEQ_utf8(s, &my_strend, 0,  utf8_target,\
1235                               m, NULL, ln, cBOOL(UTF_PATTERN)))\
1236               && (!reginfo || regtry(reginfo, &s)) )   \
1237               goto got_it;                             \
1238     }                                                  \
1239 }                                                      \
1240 s += len
1241
1242 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1243 STMT_START {                                              \
1244     while (s <= e) {                                      \
1245         if ( (CoNd)                                       \
1246              && (ln == 1 || (OP(c) == EXACTF             \
1247                               ? foldEQ(s, m, ln)           \
1248                               : foldEQ_locale(s, m, ln)))  \
1249              && (!reginfo || regtry(reginfo, &s)) )        \
1250             goto got_it;                                  \
1251         s++;                                              \
1252     }                                                     \
1253 } STMT_END
1254
1255 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1256 STMT_START {                                          \
1257     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1258         CoDe                                          \
1259         s += uskip;                                   \
1260     }                                                 \
1261 } STMT_END
1262
1263 #define REXEC_FBC_SCAN(CoDe)                          \
1264 STMT_START {                                          \
1265     while (s < strend) {                              \
1266         CoDe                                          \
1267         s++;                                          \
1268     }                                                 \
1269 } STMT_END
1270
1271 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1272 REXEC_FBC_UTF8_SCAN(                                  \
1273     if (CoNd) {                                       \
1274         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1275             goto got_it;                              \
1276         else                                          \
1277             tmp = doevery;                            \
1278     }                                                 \
1279     else                                              \
1280         tmp = 1;                                      \
1281 )
1282
1283 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1284 REXEC_FBC_SCAN(                                       \
1285     if (CoNd) {                                       \
1286         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1287             goto got_it;                              \
1288         else                                          \
1289             tmp = doevery;                            \
1290     }                                                 \
1291     else                                              \
1292         tmp = 1;                                      \
1293 )
1294
1295 #define REXEC_FBC_TRYIT               \
1296 if ((!reginfo || regtry(reginfo, &s))) \
1297     goto got_it
1298
1299 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1300     if (utf8_target) {                                             \
1301         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1302     }                                                          \
1303     else {                                                     \
1304         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1305     }                                                          \
1306     break
1307
1308 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1309     if (utf8_target) {                                             \
1310         UtFpReLoAd;                                            \
1311         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1312     }                                                          \
1313     else {                                                     \
1314         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1315     }                                                          \
1316     break
1317
1318 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1319     PL_reg_flags |= RF_tainted;                                \
1320     if (utf8_target) {                                             \
1321         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1322     }                                                          \
1323     else {                                                     \
1324         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1325     }                                                          \
1326     break
1327
1328 #define DUMP_EXEC_POS(li,s,doutf8) \
1329     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1330
1331 /* We know what class REx starts with.  Try to find this position... */
1332 /* if reginfo is NULL, its a dryrun */
1333 /* annoyingly all the vars in this routine have different names from their counterparts
1334    in regmatch. /grrr */
1335
1336 STATIC char *
1337 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1338     const char *strend, regmatch_info *reginfo)
1339 {
1340         dVAR;
1341         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1342         char *m;
1343         STRLEN ln;
1344         STRLEN lnc;
1345         register STRLEN uskip;
1346         unsigned int c1;
1347         unsigned int c2;
1348         char *e;
1349         register I32 tmp = 1;   /* Scratch variable? */
1350         register const bool utf8_target = PL_reg_match_utf8;
1351         RXi_GET_DECL(prog,progi);
1352
1353         PERL_ARGS_ASSERT_FIND_BYCLASS;
1354
1355         /* We know what class it must start with. */
1356         switch (OP(c)) {
1357         case ANYOF:
1358             if (utf8_target) {
1359                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1360                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1361                           reginclass(prog, c, (U8*)s, 0, utf8_target) :
1362                           REGINCLASS(prog, c, (U8*)s));
1363             }
1364             else {
1365                  while (s < strend) {
1366                       STRLEN skip = 1;
1367
1368                       if (REGINCLASS(prog, c, (U8*)s) ||
1369                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1370                            /* The assignment of 2 is intentional:
1371                             * for the folded sharp s, the skip is 2. */
1372                            (skip = SHARP_S_SKIP))) {
1373                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1374                                 goto got_it;
1375                            else
1376                                 tmp = doevery;
1377                       }
1378                       else
1379                            tmp = 1;
1380                       s += skip;
1381                  }
1382             }
1383             break;
1384         case CANY:
1385             REXEC_FBC_SCAN(
1386                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1387                     goto got_it;
1388                 else
1389                     tmp = doevery;
1390             );
1391             break;
1392         case EXACTF:
1393             m   = STRING(c);
1394             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1395             lnc = (I32) ln;     /* length to match in characters */
1396             if (UTF_PATTERN) {
1397                 STRLEN ulen1, ulen2;
1398                 U8 *sm = (U8 *) m;
1399                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1400                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1401                 /* used by commented-out code below */
1402                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1403
1404                 /* XXX: Since the node will be case folded at compile
1405                    time this logic is a little odd, although im not
1406                    sure that its actually wrong. --dmq */
1407
1408                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1409                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1410
1411                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1412                    codepoint of the first character in the converted
1413                    form, yet originally we did the extra step.
1414                    No tests fail by commenting this code out however
1415                    so Ive left it out. -- dmq.
1416
1417                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1418                                     0, uniflags);
1419                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1420                                     0, uniflags);
1421                 */
1422
1423                 lnc = 0;
1424                 while (sm < ((U8 *) m + ln)) {
1425                     lnc++;
1426                     sm += UTF8SKIP(sm);
1427                 }
1428             }
1429             else {
1430                 c1 = *(U8*)m;
1431                 c2 = PL_fold[c1];
1432             }
1433             goto do_exactf;
1434         case EXACTFL:
1435             m   = STRING(c);
1436             ln  = STR_LEN(c);
1437             lnc = (I32) ln;
1438             c1 = *(U8*)m;
1439             c2 = PL_fold_locale[c1];
1440           do_exactf:
1441             e = HOP3c(strend, -((I32)lnc), s);
1442
1443             if (!reginfo && e < s)
1444                 e = s;                  /* Due to minlen logic of intuit() */
1445
1446             /* The idea in the EXACTF* cases is to first find the
1447              * first character of the EXACTF* node and then, if
1448              * necessary, case-insensitively compare the full
1449              * text of the node.  The c1 and c2 are the first
1450              * characters (though in Unicode it gets a bit
1451              * more complicated because there are more cases
1452              * than just upper and lower: one needs to use
1453              * the so-called folding case for case-insensitive
1454              * matching (called "loose matching" in Unicode).
1455              * foldEQ_utf8() will do just that. */
1456
1457             if (utf8_target || UTF_PATTERN) {
1458                 UV c, f;
1459                 U8 tmpbuf [UTF8_MAXBYTES+1];
1460                 STRLEN len = 1;
1461                 STRLEN foldlen;
1462                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1463                 if (c1 == c2) {
1464                     /* Upper and lower of 1st char are equal -
1465                      * probably not a "letter". */
1466                     while (s <= e) {
1467                         if (utf8_target) {
1468                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1469                                            uniflags);
1470                         } else {
1471                             c = *((U8*)s);
1472                         }
1473                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1474                     }
1475                 }
1476                 else {
1477                     while (s <= e) {
1478                         if (utf8_target) {
1479                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1480                                            uniflags);
1481                         } else {
1482                             c = *((U8*)s);
1483                         }
1484
1485                         /* Handle some of the three Greek sigmas cases.
1486                          * Note that not all the possible combinations
1487                          * are handled here: some of them are handled
1488                          * by the standard folding rules, and some of
1489                          * them (the character class or ANYOF cases)
1490                          * are handled during compiletime in
1491                          * regexec.c:S_regclass(). */
1492                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1493                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1494                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1495
1496                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1497                     }
1498                 }
1499             }
1500             else {
1501                 /* Neither pattern nor string are UTF8 */
1502                 if (c1 == c2)
1503                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1504                 else
1505                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1506             }
1507             break;
1508         case BOUNDL:
1509             PL_reg_flags |= RF_tainted;
1510             /* FALL THROUGH */
1511         case BOUND:
1512             if (utf8_target) {
1513                 if (s == PL_bostr)
1514                     tmp = '\n';
1515                 else {
1516                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1517                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1518                 }
1519                 tmp = ((OP(c) == BOUND ?
1520                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1521                 LOAD_UTF8_CHARCLASS_ALNUM();
1522                 REXEC_FBC_UTF8_SCAN(
1523                     if (tmp == !(OP(c) == BOUND ?
1524                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1525                                  isALNUM_LC_utf8((U8*)s)))
1526                     {
1527                         tmp = !tmp;
1528                         REXEC_FBC_TRYIT;
1529                 }
1530                 );
1531             }
1532             else {  /* Not utf8 */
1533                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1534                 tmp = cBOOL((OP(c) == BOUNDL)
1535                             ? isALNUM_LC(tmp)
1536                             : (isWORDCHAR_L1(tmp)
1537                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1538                 REXEC_FBC_SCAN(
1539                     if (tmp ==
1540                         !((OP(c) == BOUNDL)
1541                           ? isALNUM_LC(*s)
1542                           : (isWORDCHAR_L1((U8) *s)
1543                              && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1544                     {
1545                         tmp = !tmp;
1546                         REXEC_FBC_TRYIT;
1547                 }
1548                 );
1549             }
1550             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1551                 goto got_it;
1552             break;
1553         case NBOUNDL:
1554             PL_reg_flags |= RF_tainted;
1555             /* FALL THROUGH */
1556         case NBOUND:
1557             if (utf8_target) {
1558                 if (s == PL_bostr)
1559                     tmp = '\n';
1560                 else {
1561                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1562                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1563                 }
1564                 tmp = ((OP(c) == NBOUND ?
1565                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1566                 LOAD_UTF8_CHARCLASS_ALNUM();
1567                 REXEC_FBC_UTF8_SCAN(
1568                     if (tmp == !(OP(c) == NBOUND ?
1569                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1570                                  isALNUM_LC_utf8((U8*)s)))
1571                         tmp = !tmp;
1572                     else REXEC_FBC_TRYIT;
1573                 );
1574             }
1575             else {
1576                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1577                 tmp = cBOOL((OP(c) == NBOUNDL)
1578                             ? isALNUM_LC(tmp)
1579                             : (isWORDCHAR_L1(tmp)
1580                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1581                 REXEC_FBC_SCAN(
1582                     if (tmp == ! cBOOL(
1583                             (OP(c) == NBOUNDL)
1584                             ? isALNUM_LC(*s)
1585                             : (isWORDCHAR_L1((U8) *s)
1586                                && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1587                     {
1588                         tmp = !tmp;
1589                     }
1590                     else REXEC_FBC_TRYIT;
1591                 );
1592             }
1593             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1594                 goto got_it;
1595             break;
1596         case ALNUM:
1597             REXEC_FBC_CSCAN_PRELOAD(
1598                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1599                 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1600                 (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
1601             );
1602         case ALNUML:
1603             REXEC_FBC_CSCAN_TAINT(
1604                 isALNUM_LC_utf8((U8*)s),
1605                 isALNUM_LC(*s)
1606             );
1607         case NALNUM:
1608             REXEC_FBC_CSCAN_PRELOAD(
1609                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1610                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1611                 ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
1612             );
1613         case NALNUML:
1614             REXEC_FBC_CSCAN_TAINT(
1615                 !isALNUM_LC_utf8((U8*)s),
1616                 !isALNUM_LC(*s)
1617             );
1618         case SPACE:
1619             REXEC_FBC_CSCAN_PRELOAD(
1620                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1621                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1622                 isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
1623             );
1624         case SPACEL:
1625             REXEC_FBC_CSCAN_TAINT(
1626                 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1627                 isSPACE_LC(*s)
1628             );
1629         case NSPACE:
1630             REXEC_FBC_CSCAN_PRELOAD(
1631                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1632                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1633                 !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
1634             );
1635         case NSPACEL:
1636             REXEC_FBC_CSCAN_TAINT(
1637                 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1638                 !isSPACE_LC(*s)
1639             );
1640         case DIGIT:
1641             REXEC_FBC_CSCAN_PRELOAD(
1642                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1643                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1644                 isDIGIT(*s)
1645             );
1646         case DIGITL:
1647             REXEC_FBC_CSCAN_TAINT(
1648                 isDIGIT_LC_utf8((U8*)s),
1649                 isDIGIT_LC(*s)
1650             );
1651         case NDIGIT:
1652             REXEC_FBC_CSCAN_PRELOAD(
1653                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1654                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1655                 !isDIGIT(*s)
1656             );
1657         case NDIGITL:
1658             REXEC_FBC_CSCAN_TAINT(
1659                 !isDIGIT_LC_utf8((U8*)s),
1660                 !isDIGIT_LC(*s)
1661             );
1662         case LNBREAK:
1663             REXEC_FBC_CSCAN(
1664                 is_LNBREAK_utf8(s),
1665                 is_LNBREAK_latin1(s)
1666             );
1667         case VERTWS:
1668             REXEC_FBC_CSCAN(
1669                 is_VERTWS_utf8(s),
1670                 is_VERTWS_latin1(s)
1671             );
1672         case NVERTWS:
1673             REXEC_FBC_CSCAN(
1674                 !is_VERTWS_utf8(s),
1675                 !is_VERTWS_latin1(s)
1676             );
1677         case HORIZWS:
1678             REXEC_FBC_CSCAN(
1679                 is_HORIZWS_utf8(s),
1680                 is_HORIZWS_latin1(s)
1681             );
1682         case NHORIZWS:
1683             REXEC_FBC_CSCAN(
1684                 !is_HORIZWS_utf8(s),
1685                 !is_HORIZWS_latin1(s)
1686             );
1687         case AHOCORASICKC:
1688         case AHOCORASICK:
1689             {
1690                 DECL_TRIE_TYPE(c);
1691                 /* what trie are we using right now */
1692                 reg_ac_data *aho
1693                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1694                 reg_trie_data *trie
1695                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1696                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1697
1698                 const char *last_start = strend - trie->minlen;
1699 #ifdef DEBUGGING
1700                 const char *real_start = s;
1701 #endif
1702                 STRLEN maxlen = trie->maxlen;
1703                 SV *sv_points;
1704                 U8 **points; /* map of where we were in the input string
1705                                 when reading a given char. For ASCII this
1706                                 is unnecessary overhead as the relationship
1707                                 is always 1:1, but for Unicode, especially
1708                                 case folded Unicode this is not true. */
1709                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1710                 U8 *bitmap=NULL;
1711
1712
1713                 GET_RE_DEBUG_FLAGS_DECL;
1714
1715                 /* We can't just allocate points here. We need to wrap it in
1716                  * an SV so it gets freed properly if there is a croak while
1717                  * running the match */
1718                 ENTER;
1719                 SAVETMPS;
1720                 sv_points=newSV(maxlen * sizeof(U8 *));
1721                 SvCUR_set(sv_points,
1722                     maxlen * sizeof(U8 *));
1723                 SvPOK_on(sv_points);
1724                 sv_2mortal(sv_points);
1725                 points=(U8**)SvPV_nolen(sv_points );
1726                 if ( trie_type != trie_utf8_fold
1727                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1728                 {
1729                     if (trie->bitmap)
1730                         bitmap=(U8*)trie->bitmap;
1731                     else
1732                         bitmap=(U8*)ANYOF_BITMAP(c);
1733                 }
1734                 /* this is the Aho-Corasick algorithm modified a touch
1735                    to include special handling for long "unknown char"
1736                    sequences. The basic idea being that we use AC as long
1737                    as we are dealing with a possible matching char, when
1738                    we encounter an unknown char (and we have not encountered
1739                    an accepting state) we scan forward until we find a legal
1740                    starting char.
1741                    AC matching is basically that of trie matching, except
1742                    that when we encounter a failing transition, we fall back
1743                    to the current states "fail state", and try the current char
1744                    again, a process we repeat until we reach the root state,
1745                    state 1, or a legal transition. If we fail on the root state
1746                    then we can either terminate if we have reached an accepting
1747                    state previously, or restart the entire process from the beginning
1748                    if we have not.
1749
1750                  */
1751                 while (s <= last_start) {
1752                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1753                     U8 *uc = (U8*)s;
1754                     U16 charid = 0;
1755                     U32 base = 1;
1756                     U32 state = 1;
1757                     UV uvc = 0;
1758                     STRLEN len = 0;
1759                     STRLEN foldlen = 0;
1760                     U8 *uscan = (U8*)NULL;
1761                     U8 *leftmost = NULL;
1762 #ifdef DEBUGGING
1763                     U32 accepted_word= 0;
1764 #endif
1765                     U32 pointpos = 0;
1766
1767                     while ( state && uc <= (U8*)strend ) {
1768                         int failed=0;
1769                         U32 word = aho->states[ state ].wordnum;
1770
1771                         if( state==1 ) {
1772                             if ( bitmap ) {
1773                                 DEBUG_TRIE_EXECUTE_r(
1774                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1775                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1776                                             (char *)uc, utf8_target );
1777                                         PerlIO_printf( Perl_debug_log,
1778                                             " Scanning for legal start char...\n");
1779                                     }
1780                                 );
1781                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1782                                     uc++;
1783                                 }
1784                                 s= (char *)uc;
1785                             }
1786                             if (uc >(U8*)last_start) break;
1787                         }
1788
1789                         if ( word ) {
1790                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1791                             if (!leftmost || lpos < leftmost) {
1792                                 DEBUG_r(accepted_word=word);
1793                                 leftmost= lpos;
1794                             }
1795                             if (base==0) break;
1796
1797                         }
1798                         points[pointpos++ % maxlen]= uc;
1799                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1800                                              uscan, len, uvc, charid, foldlen,
1801                                              foldbuf, uniflags);
1802                         DEBUG_TRIE_EXECUTE_r({
1803                             dump_exec_pos( (char *)uc, c, strend, real_start,
1804                                 s,   utf8_target );
1805                             PerlIO_printf(Perl_debug_log,
1806                                 " Charid:%3u CP:%4"UVxf" ",
1807                                  charid, uvc);
1808                         });
1809
1810                         do {
1811 #ifdef DEBUGGING
1812                             word = aho->states[ state ].wordnum;
1813 #endif
1814                             base = aho->states[ state ].trans.base;
1815
1816                             DEBUG_TRIE_EXECUTE_r({
1817                                 if (failed)
1818                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1819                                         s,   utf8_target );
1820                                 PerlIO_printf( Perl_debug_log,
1821                                     "%sState: %4"UVxf", word=%"UVxf,
1822                                     failed ? " Fail transition to " : "",
1823                                     (UV)state, (UV)word);
1824                             });
1825                             if ( base ) {
1826                                 U32 tmp;
1827                                 I32 offset;
1828                                 if (charid &&
1829                                      ( ((offset = base + charid
1830                                         - 1 - trie->uniquecharcount)) >= 0)
1831                                      && ((U32)offset < trie->lasttrans)
1832                                      && trie->trans[offset].check == state
1833                                      && (tmp=trie->trans[offset].next))
1834                                 {
1835                                     DEBUG_TRIE_EXECUTE_r(
1836                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1837                                     state = tmp;
1838                                     break;
1839                                 }
1840                                 else {
1841                                     DEBUG_TRIE_EXECUTE_r(
1842                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1843                                     failed = 1;
1844                                     state = aho->fail[state];
1845                                 }
1846                             }
1847                             else {
1848                                 /* we must be accepting here */
1849                                 DEBUG_TRIE_EXECUTE_r(
1850                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1851                                 failed = 1;
1852                                 break;
1853                             }
1854                         } while(state);
1855                         uc += len;
1856                         if (failed) {
1857                             if (leftmost)
1858                                 break;
1859                             if (!state) state = 1;
1860                         }
1861                     }
1862                     if ( aho->states[ state ].wordnum ) {
1863                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1864                         if (!leftmost || lpos < leftmost) {
1865                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1866                             leftmost = lpos;
1867                         }
1868                     }
1869                     if (leftmost) {
1870                         s = (char*)leftmost;
1871                         DEBUG_TRIE_EXECUTE_r({
1872                             PerlIO_printf(
1873                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1874                                 (UV)accepted_word, (IV)(s - real_start)
1875                             );
1876                         });
1877                         if (!reginfo || regtry(reginfo, &s)) {
1878                             FREETMPS;
1879                             LEAVE;
1880                             goto got_it;
1881                         }
1882                         s = HOPc(s,1);
1883                         DEBUG_TRIE_EXECUTE_r({
1884                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1885                         });
1886                     } else {
1887                         DEBUG_TRIE_EXECUTE_r(
1888                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1889                         break;
1890                     }
1891                 }
1892                 FREETMPS;
1893                 LEAVE;
1894             }
1895             break;
1896         default:
1897             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1898             break;
1899         }
1900         return 0;
1901       got_it:
1902         return s;
1903 }
1904
1905
1906 /*
1907  - regexec_flags - match a regexp against a string
1908  */
1909 I32
1910 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1911               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1912 /* strend: pointer to null at end of string */
1913 /* strbeg: real beginning of string */
1914 /* minend: end of match must be >=minend after stringarg. */
1915 /* data: May be used for some additional optimizations.
1916          Currently its only used, with a U32 cast, for transmitting
1917          the ganch offset when doing a /g match. This will change */
1918 /* nosave: For optimizations. */
1919 {
1920     dVAR;
1921     struct regexp *const prog = (struct regexp *)SvANY(rx);
1922     /*register*/ char *s;
1923     register regnode *c;
1924     /*register*/ char *startpos = stringarg;
1925     I32 minlen;         /* must match at least this many chars */
1926     I32 dontbother = 0; /* how many characters not to try at end */
1927     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1928     I32 scream_pos = -1;                /* Internal iterator of scream. */
1929     char *scream_olds = NULL;
1930     const bool utf8_target = cBOOL(DO_UTF8(sv));
1931     I32 multiline;
1932     RXi_GET_DECL(prog,progi);
1933     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1934     regexp_paren_pair *swap = NULL;
1935     GET_RE_DEBUG_FLAGS_DECL;
1936
1937     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1938     PERL_UNUSED_ARG(data);
1939
1940     /* Be paranoid... */
1941     if (prog == NULL || startpos == NULL) {
1942         Perl_croak(aTHX_ "NULL regexp parameter");
1943         return 0;
1944     }
1945
1946     multiline = prog->extflags & RXf_PMf_MULTILINE;
1947     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1948
1949     RX_MATCH_UTF8_set(rx, utf8_target);
1950     DEBUG_EXECUTE_r(
1951         debug_start_match(rx, utf8_target, startpos, strend,
1952         "Matching");
1953     );
1954
1955     minlen = prog->minlen;
1956
1957     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1958         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1959                               "String too short [regexec_flags]...\n"));
1960         goto phooey;
1961     }
1962
1963
1964     /* Check validity of program. */
1965     if (UCHARAT(progi->program) != REG_MAGIC) {
1966         Perl_croak(aTHX_ "corrupted regexp program");
1967     }
1968
1969     PL_reg_flags = 0;
1970     PL_reg_eval_set = 0;
1971     PL_reg_maxiter = 0;
1972
1973     if (RX_UTF8(rx))
1974         PL_reg_flags |= RF_utf8;
1975
1976     /* Mark beginning of line for ^ and lookbehind. */
1977     reginfo.bol = startpos; /* XXX not used ??? */
1978     PL_bostr  = strbeg;
1979     reginfo.sv = sv;
1980
1981     /* Mark end of line for $ (and such) */
1982     PL_regeol = strend;
1983
1984     /* see how far we have to get to not match where we matched before */
1985     reginfo.till = startpos+minend;
1986
1987     /* If there is a "must appear" string, look for it. */
1988     s = startpos;
1989
1990     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1991         MAGIC *mg;
1992         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
1993             reginfo.ganch = startpos + prog->gofs;
1994             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1995               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1996         } else if (sv && SvTYPE(sv) >= SVt_PVMG
1997                   && SvMAGIC(sv)
1998                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1999                   && mg->mg_len >= 0) {
2000             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2001             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2002                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2003
2004             if (prog->extflags & RXf_ANCH_GPOS) {
2005                 if (s > reginfo.ganch)
2006                     goto phooey;
2007                 s = reginfo.ganch - prog->gofs;
2008                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2009                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2010                 if (s < strbeg)
2011                     goto phooey;
2012             }
2013         }
2014         else if (data) {
2015             reginfo.ganch = strbeg + PTR2UV(data);
2016             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2017                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2018
2019         } else {                                /* pos() not defined */
2020             reginfo.ganch = strbeg;
2021             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2022                  "GPOS: reginfo.ganch = strbeg\n"));
2023         }
2024     }
2025     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2026         /* We have to be careful. If the previous successful match
2027            was from this regex we don't want a subsequent partially
2028            successful match to clobber the old results.
2029            So when we detect this possibility we add a swap buffer
2030            to the re, and switch the buffer each match. If we fail
2031            we switch it back, otherwise we leave it swapped.
2032         */
2033         swap = prog->offs;
2034         /* do we need a save destructor here for eval dies? */
2035         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2036     }
2037     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2038         re_scream_pos_data d;
2039
2040         d.scream_olds = &scream_olds;
2041         d.scream_pos = &scream_pos;
2042         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2043         if (!s) {
2044             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2045             goto phooey;        /* not present */
2046         }
2047     }
2048
2049
2050
2051     /* Simplest case:  anchored match need be tried only once. */
2052     /*  [unless only anchor is BOL and multiline is set] */
2053     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2054         if (s == startpos && regtry(&reginfo, &startpos))
2055             goto got_it;
2056         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2057                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2058         {
2059             char *end;
2060
2061             if (minlen)
2062                 dontbother = minlen - 1;
2063             end = HOP3c(strend, -dontbother, strbeg) - 1;
2064             /* for multiline we only have to try after newlines */
2065             if (prog->check_substr || prog->check_utf8) {
2066                 /* because of the goto we can not easily reuse the macros for bifurcating the
2067                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2068                 if (utf8_target) {
2069                     if (s == startpos)
2070                         goto after_try_utf8;
2071                     while (1) {
2072                         if (regtry(&reginfo, &s)) {
2073                             goto got_it;
2074                         }
2075                       after_try_utf8:
2076                         if (s > end) {
2077                             goto phooey;
2078                         }
2079                         if (prog->extflags & RXf_USE_INTUIT) {
2080                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2081                             if (!s) {
2082                                 goto phooey;
2083                             }
2084                         }
2085                         else {
2086                             s += UTF8SKIP(s);
2087                         }
2088                     }
2089                 } /* end search for check string in unicode */
2090                 else {
2091                     if (s == startpos) {
2092                         goto after_try_latin;
2093                     }
2094                     while (1) {
2095                         if (regtry(&reginfo, &s)) {
2096                             goto got_it;
2097                         }
2098                       after_try_latin:
2099                         if (s > end) {
2100                             goto phooey;
2101                         }
2102                         if (prog->extflags & RXf_USE_INTUIT) {
2103                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2104                             if (!s) {
2105                                 goto phooey;
2106                             }
2107                         }
2108                         else {
2109                             s++;
2110                         }
2111                     }
2112                 } /* end search for check string in latin*/
2113             } /* end search for check string */
2114             else { /* search for newline */
2115                 if (s > startpos) {
2116                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2117                     s--;
2118                 }
2119                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2120                 while (s < end) {
2121                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2122                         if (regtry(&reginfo, &s))
2123                             goto got_it;
2124                     }
2125                 }
2126             } /* end search for newline */
2127         } /* end anchored/multiline check string search */
2128         goto phooey;
2129     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2130     {
2131         /* the warning about reginfo.ganch being used without intialization
2132            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2133            and we only enter this block when the same bit is set. */
2134         char *tmp_s = reginfo.ganch - prog->gofs;
2135
2136         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2137             goto got_it;
2138         goto phooey;
2139     }
2140
2141     /* Messy cases:  unanchored match. */
2142     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2143         /* we have /x+whatever/ */
2144         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2145         char ch;
2146 #ifdef DEBUGGING
2147         int did_match = 0;
2148 #endif
2149         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2150             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2151         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2152
2153         if (utf8_target) {
2154             REXEC_FBC_SCAN(
2155                 if (*s == ch) {
2156                     DEBUG_EXECUTE_r( did_match = 1 );
2157                     if (regtry(&reginfo, &s)) goto got_it;
2158                     s += UTF8SKIP(s);
2159                     while (s < strend && *s == ch)
2160                         s += UTF8SKIP(s);
2161                 }
2162             );
2163         }
2164         else {
2165             REXEC_FBC_SCAN(
2166                 if (*s == ch) {
2167                     DEBUG_EXECUTE_r( did_match = 1 );
2168                     if (regtry(&reginfo, &s)) goto got_it;
2169                     s++;
2170                     while (s < strend && *s == ch)
2171                         s++;
2172                 }
2173             );
2174         }
2175         DEBUG_EXECUTE_r(if (!did_match)
2176                 PerlIO_printf(Perl_debug_log,
2177                                   "Did not find anchored character...\n")
2178                );
2179     }
2180     else if (prog->anchored_substr != NULL
2181               || prog->anchored_utf8 != NULL
2182               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2183                   && prog->float_max_offset < strend - s)) {
2184         SV *must;
2185         I32 back_max;
2186         I32 back_min;
2187         char *last;
2188         char *last1;            /* Last position checked before */
2189 #ifdef DEBUGGING
2190         int did_match = 0;
2191 #endif
2192         if (prog->anchored_substr || prog->anchored_utf8) {
2193             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2194                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2195             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2196             back_max = back_min = prog->anchored_offset;
2197         } else {
2198             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2199                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2200             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2201             back_max = prog->float_max_offset;
2202             back_min = prog->float_min_offset;
2203         }
2204
2205
2206         if (must == &PL_sv_undef)
2207             /* could not downgrade utf8 check substring, so must fail */
2208             goto phooey;
2209
2210         if (back_min<0) {
2211             last = strend;
2212         } else {
2213             last = HOP3c(strend,        /* Cannot start after this */
2214                   -(I32)(CHR_SVLEN(must)
2215                          - (SvTAIL(must) != 0) + back_min), strbeg);
2216         }
2217         if (s > PL_bostr)
2218             last1 = HOPc(s, -1);
2219         else
2220             last1 = s - 1;      /* bogus */
2221
2222         /* XXXX check_substr already used to find "s", can optimize if
2223            check_substr==must. */
2224         scream_pos = -1;
2225         dontbother = end_shift;
2226         strend = HOPc(strend, -dontbother);
2227         while ( (s <= last) &&
2228                 ((flags & REXEC_SCREAM)
2229                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2230                                     end_shift, &scream_pos, 0))
2231                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2232                                   (unsigned char*)strend, must,
2233                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2234             /* we may be pointing at the wrong string */
2235             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2236                 s = strbeg + (s - SvPVX_const(sv));
2237             DEBUG_EXECUTE_r( did_match = 1 );
2238             if (HOPc(s, -back_max) > last1) {
2239                 last1 = HOPc(s, -back_min);
2240                 s = HOPc(s, -back_max);
2241             }
2242             else {
2243                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2244
2245                 last1 = HOPc(s, -back_min);
2246                 s = t;
2247             }
2248             if (utf8_target) {
2249                 while (s <= last1) {
2250                     if (regtry(&reginfo, &s))
2251                         goto got_it;
2252                     s += UTF8SKIP(s);
2253                 }
2254             }
2255             else {
2256                 while (s <= last1) {
2257                     if (regtry(&reginfo, &s))
2258                         goto got_it;
2259                     s++;
2260                 }
2261             }
2262         }
2263         DEBUG_EXECUTE_r(if (!did_match) {
2264             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2265                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2266             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2267                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2268                                ? "anchored" : "floating"),
2269                 quoted, RE_SV_TAIL(must));
2270         });
2271         goto phooey;
2272     }
2273     else if ( (c = progi->regstclass) ) {
2274         if (minlen) {
2275             const OPCODE op = OP(progi->regstclass);
2276             /* don't bother with what can't match */
2277             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2278                 strend = HOPc(strend, -(minlen - 1));
2279         }
2280         DEBUG_EXECUTE_r({
2281             SV * const prop = sv_newmortal();
2282             regprop(prog, prop, c);
2283             {
2284                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2285                     s,strend-s,60);
2286                 PerlIO_printf(Perl_debug_log,
2287                     "Matching stclass %.*s against %s (%d bytes)\n",
2288                     (int)SvCUR(prop), SvPVX_const(prop),
2289                      quoted, (int)(strend - s));
2290             }
2291         });
2292         if (find_byclass(prog, c, s, strend, &reginfo))
2293             goto got_it;
2294         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2295     }
2296     else {
2297         dontbother = 0;
2298         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2299             /* Trim the end. */
2300             char *last;
2301             SV* float_real;
2302
2303             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2304                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2305             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2306
2307             if (flags & REXEC_SCREAM) {
2308                 last = screaminstr(sv, float_real, s - strbeg,
2309                                    end_shift, &scream_pos, 1); /* last one */
2310                 if (!last)
2311                     last = scream_olds; /* Only one occurrence. */
2312                 /* we may be pointing at the wrong string */
2313                 else if (RXp_MATCH_COPIED(prog))
2314                     s = strbeg + (s - SvPVX_const(sv));
2315             }
2316             else {
2317                 STRLEN len;
2318                 const char * const little = SvPV_const(float_real, len);
2319
2320                 if (SvTAIL(float_real)) {
2321                     if (memEQ(strend - len + 1, little, len - 1))
2322                         last = strend - len + 1;
2323                     else if (!multiline)
2324                         last = memEQ(strend - len, little, len)
2325                             ? strend - len : NULL;
2326                     else
2327                         goto find_last;
2328                 } else {
2329                   find_last:
2330                     if (len)
2331                         last = rninstr(s, strend, little, little + len);
2332                     else
2333                         last = strend;  /* matching "$" */
2334                 }
2335             }
2336             if (last == NULL) {
2337                 DEBUG_EXECUTE_r(
2338                     PerlIO_printf(Perl_debug_log,
2339                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2340                         PL_colors[4], PL_colors[5]));
2341                 goto phooey; /* Should not happen! */
2342             }
2343             dontbother = strend - last + prog->float_min_offset;
2344         }
2345         if (minlen && (dontbother < minlen))
2346             dontbother = minlen - 1;
2347         strend -= dontbother;              /* this one's always in bytes! */
2348         /* We don't know much -- general case. */
2349         if (utf8_target) {
2350             for (;;) {
2351                 if (regtry(&reginfo, &s))
2352                     goto got_it;
2353                 if (s >= strend)
2354                     break;
2355                 s += UTF8SKIP(s);
2356             };
2357         }
2358         else {
2359             do {
2360                 if (regtry(&reginfo, &s))
2361                     goto got_it;
2362             } while (s++ < strend);
2363         }
2364     }
2365
2366     /* Failure. */
2367     goto phooey;
2368
2369 got_it:
2370     Safefree(swap);
2371     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2372
2373     if (PL_reg_eval_set)
2374         restore_pos(aTHX_ prog);
2375     if (RXp_PAREN_NAMES(prog))
2376         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2377
2378     /* make sure $`, $&, $', and $digit will work later */
2379     if ( !(flags & REXEC_NOT_FIRST) ) {
2380         RX_MATCH_COPY_FREE(rx);
2381         if (flags & REXEC_COPY_STR) {
2382             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2383 #ifdef PERL_OLD_COPY_ON_WRITE
2384             if ((SvIsCOW(sv)
2385                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2386                 if (DEBUG_C_TEST) {
2387                     PerlIO_printf(Perl_debug_log,
2388                                   "Copy on write: regexp capture, type %d\n",
2389                                   (int) SvTYPE(sv));
2390                 }
2391                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2392                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2393                 assert (SvPOKp(prog->saved_copy));
2394             } else
2395 #endif
2396             {
2397                 RX_MATCH_COPIED_on(rx);
2398                 s = savepvn(strbeg, i);
2399                 prog->subbeg = s;
2400             }
2401             prog->sublen = i;
2402         }
2403         else {
2404             prog->subbeg = strbeg;
2405             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2406         }
2407     }
2408
2409     return 1;
2410
2411 phooey:
2412     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2413                           PL_colors[4], PL_colors[5]));
2414     if (PL_reg_eval_set)
2415         restore_pos(aTHX_ prog);
2416     if (swap) {
2417         /* we failed :-( roll it back */
2418         Safefree(prog->offs);
2419         prog->offs = swap;
2420     }
2421
2422     return 0;
2423 }
2424
2425
2426 /*
2427  - regtry - try match at specific point
2428  */
2429 STATIC I32                      /* 0 failure, 1 success */
2430 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2431 {
2432     dVAR;
2433     CHECKPOINT lastcp;
2434     REGEXP *const rx = reginfo->prog;
2435     regexp *const prog = (struct regexp *)SvANY(rx);
2436     RXi_GET_DECL(prog,progi);
2437     GET_RE_DEBUG_FLAGS_DECL;
2438
2439     PERL_ARGS_ASSERT_REGTRY;
2440
2441     reginfo->cutpoint=NULL;
2442
2443     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2444         MAGIC *mg;
2445
2446         PL_reg_eval_set = RS_init;
2447         DEBUG_EXECUTE_r(DEBUG_s(
2448             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2449                           (IV)(PL_stack_sp - PL_stack_base));
2450             ));
2451         SAVESTACK_CXPOS();
2452         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2453         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2454         SAVETMPS;
2455         /* Apparently this is not needed, judging by wantarray. */
2456         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2457            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2458
2459         if (reginfo->sv) {
2460             /* Make $_ available to executed code. */
2461             if (reginfo->sv != DEFSV) {
2462                 SAVE_DEFSV;
2463                 DEFSV_set(reginfo->sv);
2464             }
2465
2466             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2467                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2468                 /* prepare for quick setting of pos */
2469 #ifdef PERL_OLD_COPY_ON_WRITE
2470                 if (SvIsCOW(reginfo->sv))
2471                     sv_force_normal_flags(reginfo->sv, 0);
2472 #endif
2473                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2474                                  &PL_vtbl_mglob, NULL, 0);
2475                 mg->mg_len = -1;
2476             }
2477             PL_reg_magic    = mg;
2478             PL_reg_oldpos   = mg->mg_len;
2479             SAVEDESTRUCTOR_X(restore_pos, prog);
2480         }
2481         if (!PL_reg_curpm) {
2482             Newxz(PL_reg_curpm, 1, PMOP);
2483 #ifdef USE_ITHREADS
2484             {
2485                 SV* const repointer = &PL_sv_undef;
2486                 /* this regexp is also owned by the new PL_reg_curpm, which
2487                    will try to free it.  */
2488                 av_push(PL_regex_padav, repointer);
2489                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2490                 PL_regex_pad = AvARRAY(PL_regex_padav);
2491             }
2492 #endif
2493         }
2494 #ifdef USE_ITHREADS
2495         /* It seems that non-ithreads works both with and without this code.
2496            So for efficiency reasons it seems best not to have the code
2497            compiled when it is not needed.  */
2498         /* This is safe against NULLs: */
2499         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2500         /* PM_reg_curpm owns a reference to this regexp.  */
2501         ReREFCNT_inc(rx);
2502 #endif
2503         PM_SETRE(PL_reg_curpm, rx);
2504         PL_reg_oldcurpm = PL_curpm;
2505         PL_curpm = PL_reg_curpm;
2506         if (RXp_MATCH_COPIED(prog)) {
2507             /*  Here is a serious problem: we cannot rewrite subbeg,
2508                 since it may be needed if this match fails.  Thus
2509                 $` inside (?{}) could fail... */
2510             PL_reg_oldsaved = prog->subbeg;
2511             PL_reg_oldsavedlen = prog->sublen;
2512 #ifdef PERL_OLD_COPY_ON_WRITE
2513             PL_nrs = prog->saved_copy;
2514 #endif
2515             RXp_MATCH_COPIED_off(prog);
2516         }
2517         else
2518             PL_reg_oldsaved = NULL;
2519         prog->subbeg = PL_bostr;
2520         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2521     }
2522     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2523     prog->offs[0].start = *startpos - PL_bostr;
2524     PL_reginput = *startpos;
2525     PL_reglastparen = &prog->lastparen;
2526     PL_reglastcloseparen = &prog->lastcloseparen;
2527     prog->lastparen = 0;
2528     prog->lastcloseparen = 0;
2529     PL_regsize = 0;
2530     PL_regoffs = prog->offs;
2531     if (PL_reg_start_tmpl <= prog->nparens) {
2532         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2533         if(PL_reg_start_tmp)
2534             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2535         else
2536             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2537     }
2538
2539     /* XXXX What this code is doing here?!!!  There should be no need
2540        to do this again and again, PL_reglastparen should take care of
2541        this!  --ilya*/
2542
2543     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2544      * Actually, the code in regcppop() (which Ilya may be meaning by
2545      * PL_reglastparen), is not needed at all by the test suite
2546      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2547      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2548      * Meanwhile, this code *is* needed for the
2549      * above-mentioned test suite tests to succeed.  The common theme
2550      * on those tests seems to be returning null fields from matches.
2551      * --jhi updated by dapm */
2552 #if 1
2553     if (prog->nparens) {
2554         regexp_paren_pair *pp = PL_regoffs;
2555         register I32 i;
2556         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2557             ++pp;
2558             pp->start = -1;
2559             pp->end = -1;
2560         }
2561     }
2562 #endif
2563     REGCP_SET(lastcp);
2564     if (regmatch(reginfo, progi->program + 1)) {
2565         PL_regoffs[0].end = PL_reginput - PL_bostr;
2566         return 1;
2567     }
2568     if (reginfo->cutpoint)
2569         *startpos= reginfo->cutpoint;
2570     REGCP_UNWIND(lastcp);
2571     return 0;
2572 }
2573
2574
2575 #define sayYES goto yes
2576 #define sayNO goto no
2577 #define sayNO_SILENT goto no_silent
2578
2579 /* we dont use STMT_START/END here because it leads to
2580    "unreachable code" warnings, which are bogus, but distracting. */
2581 #define CACHEsayNO \
2582     if (ST.cache_mask) \
2583        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2584     sayNO
2585
2586 /* this is used to determine how far from the left messages like
2587    'failed...' are printed. It should be set such that messages
2588    are inline with the regop output that created them.
2589 */
2590 #define REPORT_CODE_OFF 32
2591
2592
2593 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2594 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2595
2596 #define SLAB_FIRST(s) (&(s)->states[0])
2597 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2598
2599 /* grab a new slab and return the first slot in it */
2600
2601 STATIC regmatch_state *
2602 S_push_slab(pTHX)
2603 {
2604 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2605     dMY_CXT;
2606 #endif
2607     regmatch_slab *s = PL_regmatch_slab->next;
2608     if (!s) {
2609         Newx(s, 1, regmatch_slab);
2610         s->prev = PL_regmatch_slab;
2611         s->next = NULL;
2612         PL_regmatch_slab->next = s;
2613     }
2614     PL_regmatch_slab = s;
2615     return SLAB_FIRST(s);
2616 }
2617
2618
2619 /* push a new state then goto it */
2620
2621 #define PUSH_STATE_GOTO(state, node) \
2622     scan = node; \
2623     st->resume_state = state; \
2624     goto push_state;
2625
2626 /* push a new state with success backtracking, then goto it */
2627
2628 #define PUSH_YES_STATE_GOTO(state, node) \
2629     scan = node; \
2630     st->resume_state = state; \
2631     goto push_yes_state;
2632
2633
2634
2635 /*
2636
2637 regmatch() - main matching routine
2638
2639 This is basically one big switch statement in a loop. We execute an op,
2640 set 'next' to point the next op, and continue. If we come to a point which
2641 we may need to backtrack to on failure such as (A|B|C), we push a
2642 backtrack state onto the backtrack stack. On failure, we pop the top
2643 state, and re-enter the loop at the state indicated. If there are no more
2644 states to pop, we return failure.
2645
2646 Sometimes we also need to backtrack on success; for example /A+/, where
2647 after successfully matching one A, we need to go back and try to
2648 match another one; similarly for lookahead assertions: if the assertion
2649 completes successfully, we backtrack to the state just before the assertion
2650 and then carry on.  In these cases, the pushed state is marked as
2651 'backtrack on success too'. This marking is in fact done by a chain of
2652 pointers, each pointing to the previous 'yes' state. On success, we pop to
2653 the nearest yes state, discarding any intermediate failure-only states.
2654 Sometimes a yes state is pushed just to force some cleanup code to be
2655 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2656 it to free the inner regex.
2657
2658 Note that failure backtracking rewinds the cursor position, while
2659 success backtracking leaves it alone.
2660
2661 A pattern is complete when the END op is executed, while a subpattern
2662 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2663 ops trigger the "pop to last yes state if any, otherwise return true"
2664 behaviour.
2665
2666 A common convention in this function is to use A and B to refer to the two
2667 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2668 the subpattern to be matched possibly multiple times, while B is the entire
2669 rest of the pattern. Variable and state names reflect this convention.
2670
2671 The states in the main switch are the union of ops and failure/success of
2672 substates associated with with that op.  For example, IFMATCH is the op
2673 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2674 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2675 successfully matched A and IFMATCH_A_fail is a state saying that we have
2676 just failed to match A. Resume states always come in pairs. The backtrack
2677 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2678 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2679 on success or failure.
2680
2681 The struct that holds a backtracking state is actually a big union, with
2682 one variant for each major type of op. The variable st points to the
2683 top-most backtrack struct. To make the code clearer, within each
2684 block of code we #define ST to alias the relevant union.
2685
2686 Here's a concrete example of a (vastly oversimplified) IFMATCH
2687 implementation:
2688
2689     switch (state) {
2690     ....
2691
2692 #define ST st->u.ifmatch
2693
2694     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2695         ST.foo = ...; // some state we wish to save
2696         ...
2697         // push a yes backtrack state with a resume value of
2698         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2699         // first node of A:
2700         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2701         // NOTREACHED
2702
2703     case IFMATCH_A: // we have successfully executed A; now continue with B
2704         next = B;
2705         bar = ST.foo; // do something with the preserved value
2706         break;
2707
2708     case IFMATCH_A_fail: // A failed, so the assertion failed
2709         ...;   // do some housekeeping, then ...
2710         sayNO; // propagate the failure
2711
2712 #undef ST
2713
2714     ...
2715     }
2716
2717 For any old-timers reading this who are familiar with the old recursive
2718 approach, the code above is equivalent to:
2719
2720     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2721     {
2722         int foo = ...
2723         ...
2724         if (regmatch(A)) {
2725             next = B;
2726             bar = foo;
2727             break;
2728         }
2729         ...;   // do some housekeeping, then ...
2730         sayNO; // propagate the failure
2731     }
2732
2733 The topmost backtrack state, pointed to by st, is usually free. If you
2734 want to claim it, populate any ST.foo fields in it with values you wish to
2735 save, then do one of
2736
2737         PUSH_STATE_GOTO(resume_state, node);
2738         PUSH_YES_STATE_GOTO(resume_state, node);
2739
2740 which sets that backtrack state's resume value to 'resume_state', pushes a
2741 new free entry to the top of the backtrack stack, then goes to 'node'.
2742 On backtracking, the free slot is popped, and the saved state becomes the
2743 new free state. An ST.foo field in this new top state can be temporarily
2744 accessed to retrieve values, but once the main loop is re-entered, it
2745 becomes available for reuse.
2746
2747 Note that the depth of the backtrack stack constantly increases during the
2748 left-to-right execution of the pattern, rather than going up and down with
2749 the pattern nesting. For example the stack is at its maximum at Z at the
2750 end of the pattern, rather than at X in the following:
2751
2752     /(((X)+)+)+....(Y)+....Z/
2753
2754 The only exceptions to this are lookahead/behind assertions and the cut,
2755 (?>A), which pop all the backtrack states associated with A before
2756 continuing.
2757
2758 Bascktrack state structs are allocated in slabs of about 4K in size.
2759 PL_regmatch_state and st always point to the currently active state,
2760 and PL_regmatch_slab points to the slab currently containing
2761 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2762 allocated, and is never freed until interpreter destruction. When the slab
2763 is full, a new one is allocated and chained to the end. At exit from
2764 regmatch(), slabs allocated since entry are freed.
2765
2766 */
2767
2768
2769 #define DEBUG_STATE_pp(pp)                                  \
2770     DEBUG_STATE_r({                                         \
2771         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2772         PerlIO_printf(Perl_debug_log,                       \
2773             "    %*s"pp" %s%s%s%s%s\n",                     \
2774             depth*2, "",                                    \
2775             PL_reg_name[st->resume_state],                     \
2776             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2777             ((st==yes_state) ? "Y" : ""),                   \
2778             ((st==mark_state) ? "M" : ""),                  \
2779             ((st==yes_state||st==mark_state) ? "]" : "")    \
2780         );                                                  \
2781     });
2782
2783
2784 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2785
2786 #ifdef DEBUGGING
2787
2788 STATIC void
2789 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2790     const char *start, const char *end, const char *blurb)
2791 {
2792     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2793
2794     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2795
2796     if (!PL_colorset)
2797             reginitcolors();
2798     {
2799         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2800             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2801
2802         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2803             start, end - start, 60);
2804
2805         PerlIO_printf(Perl_debug_log,
2806             "%s%s REx%s %s against %s\n",
2807                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2808
2809         if (utf8_target||utf8_pat)
2810             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2811                 utf8_pat ? "pattern" : "",
2812                 utf8_pat && utf8_target ? " and " : "",
2813                 utf8_target ? "string" : ""
2814             );
2815     }
2816 }
2817
2818 STATIC void
2819 S_dump_exec_pos(pTHX_ const char *locinput,
2820                       const regnode *scan,
2821                       const char *loc_regeol,
2822                       const char *loc_bostr,
2823                       const char *loc_reg_starttry,
2824                       const bool utf8_target)
2825 {
2826     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2827     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2828     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2829     /* The part of the string before starttry has one color
2830        (pref0_len chars), between starttry and current
2831        position another one (pref_len - pref0_len chars),
2832        after the current position the third one.
2833        We assume that pref0_len <= pref_len, otherwise we
2834        decrease pref0_len.  */
2835     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2836         ? (5 + taill) - l : locinput - loc_bostr;
2837     int pref0_len;
2838
2839     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2840
2841     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2842         pref_len++;
2843     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2844     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2845         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2846               ? (5 + taill) - pref_len : loc_regeol - locinput);
2847     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2848         l--;
2849     if (pref0_len < 0)
2850         pref0_len = 0;
2851     if (pref0_len > pref_len)
2852         pref0_len = pref_len;
2853     {
2854         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2855
2856         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2857             (locinput - pref_len),pref0_len, 60, 4, 5);
2858
2859         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2860                     (locinput - pref_len + pref0_len),
2861                     pref_len - pref0_len, 60, 2, 3);
2862
2863         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2864                     locinput, loc_regeol - locinput, 10, 0, 1);
2865
2866         const STRLEN tlen=len0+len1+len2;
2867         PerlIO_printf(Perl_debug_log,
2868                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2869                     (IV)(locinput - loc_bostr),
2870                     len0, s0,
2871                     len1, s1,
2872                     (docolor ? "" : "> <"),
2873                     len2, s2,
2874                     (int)(tlen > 19 ? 0 :  19 - tlen),
2875                     "");
2876     }
2877 }
2878
2879 #endif
2880
2881 /* reg_check_named_buff_matched()
2882  * Checks to see if a named buffer has matched. The data array of
2883  * buffer numbers corresponding to the buffer is expected to reside
2884  * in the regexp->data->data array in the slot stored in the ARG() of
2885  * node involved. Note that this routine doesn't actually care about the
2886  * name, that information is not preserved from compilation to execution.
2887  * Returns the index of the leftmost defined buffer with the given name
2888  * or 0 if non of the buffers matched.
2889  */
2890 STATIC I32
2891 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2892 {
2893     I32 n;
2894     RXi_GET_DECL(rex,rexi);
2895     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2896     I32 *nums=(I32*)SvPVX(sv_dat);
2897
2898     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2899
2900     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2901         if ((I32)*PL_reglastparen >= nums[n] &&
2902             PL_regoffs[nums[n]].end != -1)
2903         {
2904             return nums[n];
2905         }
2906     }
2907     return 0;
2908 }
2909
2910
2911 /* free all slabs above current one  - called during LEAVE_SCOPE */
2912
2913 STATIC void
2914 S_clear_backtrack_stack(pTHX_ void *p)
2915 {
2916     regmatch_slab *s = PL_regmatch_slab->next;
2917     PERL_UNUSED_ARG(p);
2918
2919     if (!s)
2920         return;
2921     PL_regmatch_slab->next = NULL;
2922     while (s) {
2923         regmatch_slab * const osl = s;
2924         s = s->next;
2925         Safefree(osl);
2926     }
2927 }
2928
2929
2930 #define SETREX(Re1,Re2) \
2931     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2932     Re1 = (Re2)
2933
2934 STATIC I32                      /* 0 failure, 1 success */
2935 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2936 {
2937 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2938     dMY_CXT;
2939 #endif
2940     dVAR;
2941     register const bool utf8_target = PL_reg_match_utf8;
2942     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2943     REGEXP *rex_sv = reginfo->prog;
2944     regexp *rex = (struct regexp *)SvANY(rex_sv);
2945     RXi_GET_DECL(rex,rexi);
2946     I32 oldsave;
2947     /* the current state. This is a cached copy of PL_regmatch_state */
2948     register regmatch_state *st;
2949     /* cache heavy used fields of st in registers */
2950     register regnode *scan;
2951     register regnode *next;
2952     register U32 n = 0; /* general value; init to avoid compiler warning */
2953     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2954     register char *locinput = PL_reginput;
2955     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2956
2957     bool result = 0;        /* return value of S_regmatch */
2958     int depth = 0;          /* depth of backtrack stack */
2959     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2960     const U32 max_nochange_depth =
2961         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2962         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2963     regmatch_state *yes_state = NULL; /* state to pop to on success of
2964                                                             subpattern */
2965     /* mark_state piggy backs on the yes_state logic so that when we unwind
2966        the stack on success we can update the mark_state as we go */
2967     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2968     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2969     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2970     U32 state_num;
2971     bool no_final = 0;      /* prevent failure from backtracking? */
2972     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2973     char *startpoint = PL_reginput;
2974     SV *popmark = NULL;     /* are we looking for a mark? */
2975     SV *sv_commit = NULL;   /* last mark name seen in failure */
2976     SV *sv_yes_mark = NULL; /* last mark name we have seen
2977                                during a successfull match */
2978     U32 lastopen = 0;       /* last open we saw */
2979     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2980     SV* const oreplsv = GvSV(PL_replgv);
2981     /* these three flags are set by various ops to signal information to
2982      * the very next op. They have a useful lifetime of exactly one loop
2983      * iteration, and are not preserved or restored by state pushes/pops
2984      */
2985     bool sw = 0;            /* the condition value in (?(cond)a|b) */
2986     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
2987     int logical = 0;        /* the following EVAL is:
2988                                 0: (?{...})
2989                                 1: (?(?{...})X|Y)
2990                                 2: (??{...})
2991                                or the following IFMATCH/UNLESSM is:
2992                                 false: plain (?=foo)
2993                                 true:  used as a condition: (?(?=foo))
2994                             */
2995 #ifdef DEBUGGING
2996     GET_RE_DEBUG_FLAGS_DECL;
2997 #endif
2998
2999     PERL_ARGS_ASSERT_REGMATCH;
3000
3001     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3002             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3003     }));
3004     /* on first ever call to regmatch, allocate first slab */
3005     if (!PL_regmatch_slab) {
3006         Newx(PL_regmatch_slab, 1, regmatch_slab);
3007         PL_regmatch_slab->prev = NULL;
3008         PL_regmatch_slab->next = NULL;
3009         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3010     }
3011
3012     oldsave = PL_savestack_ix;
3013     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3014     SAVEVPTR(PL_regmatch_slab);
3015     SAVEVPTR(PL_regmatch_state);
3016
3017     /* grab next free state slot */
3018     st = ++PL_regmatch_state;
3019     if (st >  SLAB_LAST(PL_regmatch_slab))
3020         st = PL_regmatch_state = S_push_slab(aTHX);
3021
3022     /* Note that nextchr is a byte even in UTF */
3023     nextchr = UCHARAT(locinput);
3024     scan = prog;
3025     while (scan != NULL) {
3026
3027         DEBUG_EXECUTE_r( {
3028             SV * const prop = sv_newmortal();
3029             regnode *rnext=regnext(scan);
3030             DUMP_EXEC_POS( locinput, scan, utf8_target );
3031             regprop(rex, prop, scan);
3032
3033             PerlIO_printf(Perl_debug_log,
3034                     "%3"IVdf":%*s%s(%"IVdf")\n",
3035                     (IV)(scan - rexi->program), depth*2, "",
3036                     SvPVX_const(prop),
3037                     (PL_regkind[OP(scan)] == END || !rnext) ?
3038                         0 : (IV)(rnext - rexi->program));
3039         });
3040
3041         next = scan + NEXT_OFF(scan);
3042         if (next == scan)
3043             next = NULL;
3044         state_num = OP(scan);
3045
3046         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3047       reenter_switch:
3048
3049         assert(PL_reglastparen == &rex->lastparen);
3050         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3051         assert(PL_regoffs == rex->offs);
3052
3053         switch (state_num) {
3054         case BOL:
3055             if (locinput == PL_bostr)
3056             {
3057                 /* reginfo->till = reginfo->bol; */
3058                 break;
3059             }
3060             sayNO;
3061         case MBOL:
3062             if (locinput == PL_bostr ||
3063                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3064             {
3065                 break;
3066             }
3067             sayNO;
3068         case SBOL:
3069             if (locinput == PL_bostr)
3070                 break;
3071             sayNO;
3072         case GPOS:
3073             if (locinput == reginfo->ganch)
3074                 break;
3075             sayNO;
3076
3077         case KEEPS:
3078             /* update the startpoint */
3079             st->u.keeper.val = PL_regoffs[0].start;
3080             PL_reginput = locinput;
3081             PL_regoffs[0].start = locinput - PL_bostr;
3082             PUSH_STATE_GOTO(KEEPS_next, next);
3083             /*NOT-REACHED*/
3084         case KEEPS_next_fail:
3085             /* rollback the start point change */
3086             PL_regoffs[0].start = st->u.keeper.val;
3087             sayNO_SILENT;
3088             /*NOT-REACHED*/
3089         case EOL:
3090                 goto seol;
3091         case MEOL:
3092             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3093                 sayNO;
3094             break;
3095         case SEOL:
3096           seol:
3097             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3098                 sayNO;
3099             if (PL_regeol - locinput > 1)
3100                 sayNO;
3101             break;
3102         case EOS:
3103             if (PL_regeol != locinput)
3104                 sayNO;
3105             break;
3106         case SANY:
3107             if (!nextchr && locinput >= PL_regeol)
3108                 sayNO;
3109             if (utf8_target) {
3110                 locinput += PL_utf8skip[nextchr];
3111                 if (locinput > PL_regeol)
3112                     sayNO;
3113                 nextchr = UCHARAT(locinput);
3114             }
3115             else
3116                 nextchr = UCHARAT(++locinput);
3117             break;
3118         case CANY:
3119             if (!nextchr && locinput >= PL_regeol)
3120                 sayNO;
3121             nextchr = UCHARAT(++locinput);
3122             break;
3123         case REG_ANY:
3124             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3125                 sayNO;
3126             if (utf8_target) {
3127                 locinput += PL_utf8skip[nextchr];
3128                 if (locinput > PL_regeol)
3129                     sayNO;
3130                 nextchr = UCHARAT(locinput);
3131             }
3132             else
3133                 nextchr = UCHARAT(++locinput);
3134             break;
3135
3136 #undef  ST
3137 #define ST st->u.trie
3138         case TRIEC:
3139             /* In this case the charclass data is available inline so
3140                we can fail fast without a lot of extra overhead.
3141              */
3142             if (scan->flags == EXACT || !utf8_target) {
3143                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3144                     DEBUG_EXECUTE_r(
3145                         PerlIO_printf(Perl_debug_log,
3146                                   "%*s  %sfailed to match trie start class...%s\n",
3147                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3148                     );
3149                     sayNO_SILENT;
3150                     /* NOTREACHED */
3151                 }
3152             }
3153             /* FALL THROUGH */
3154         case TRIE:
3155             /* the basic plan of execution of the trie is:
3156              * At the beginning, run though all the states, and
3157              * find the longest-matching word. Also remember the position
3158              * of the shortest matching word. For example, this pattern:
3159              *    1  2 3 4    5
3160              *    ab|a|x|abcd|abc
3161              * when matched against the string "abcde", will generate
3162              * accept states for all words except 3, with the longest
3163              * matching word being 4, and the shortest being 1 (with
3164              * the position being after char 1 of the string).
3165              *
3166              * Then for each matching word, in word order (i.e. 1,2,4,5),
3167              * we run the remainder of the pattern; on each try setting
3168              * the current position to the character following the word,
3169              * returning to try the next word on failure.
3170              *
3171              * We avoid having to build a list of words at runtime by
3172              * using a compile-time structure, wordinfo[].prev, which
3173              * gives, for each word, the previous accepting word (if any).
3174              * In the case above it would contain the mappings 1->2, 2->0,
3175              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3176              * the longest word (4 above), a list of all words, by
3177              * following the list of prev pointers; this gives us the
3178              * unordered list 4,5,1,2. Then given the current word we have
3179              * just tried, we can go through the list and find the
3180              * next-biggest word to try (so if we just failed on word 2,
3181              * the next in the list is 4).
3182              *
3183              * Since at runtime we don't record the matching position in
3184              * the string for each word, we have to work that out for
3185              * each word we're about to process. The wordinfo table holds
3186              * the character length of each word; given that we recorded
3187              * at the start: the position of the shortest word and its
3188              * length in chars, we just need to move the pointer the
3189              * difference between the two char lengths. Depending on
3190              * Unicode status and folding, that's cheap or expensive.
3191              *
3192              * This algorithm is optimised for the case where are only a
3193              * small number of accept states, i.e. 0,1, or maybe 2.
3194              * With lots of accepts states, and having to try all of them,
3195              * it becomes quadratic on number of accept states to find all
3196              * the next words.
3197              */
3198
3199             {
3200                 /* what type of TRIE am I? (utf8 makes this contextual) */
3201                 DECL_TRIE_TYPE(scan);
3202
3203                 /* what trie are we using right now */
3204                 reg_trie_data * const trie
3205                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3206                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3207                 U32 state = trie->startstate;
3208
3209                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3210                     !TRIE_BITMAP_TEST(trie,*locinput)
3211                 ) {
3212                     if (trie->states[ state ].wordnum) {
3213                          DEBUG_EXECUTE_r(
3214                             PerlIO_printf(Perl_debug_log,
3215                                           "%*s  %smatched empty string...%s\n",
3216                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3217                         );
3218                         break;
3219                     } else {
3220                         DEBUG_EXECUTE_r(
3221                             PerlIO_printf(Perl_debug_log,
3222                                           "%*s  %sfailed to match trie start class...%s\n",
3223                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3224                         );
3225                         sayNO_SILENT;
3226                    }
3227                 }
3228
3229             {
3230                 U8 *uc = ( U8* )locinput;
3231
3232                 STRLEN len = 0;
3233                 STRLEN foldlen = 0;
3234                 U8 *uscan = (U8*)NULL;
3235                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3236                 U32 charcount = 0; /* how many input chars we have matched */
3237                 U32 accepted = 0; /* have we seen any accepting states? */
3238
3239                 ST.B = next;
3240                 ST.jump = trie->jump;
3241                 ST.me = scan;
3242                 ST.firstpos = NULL;
3243                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3244                 ST.nextword = 0;
3245
3246                 /* fully traverse the TRIE; note the position of the
3247                    shortest accept state and the wordnum of the longest
3248                    accept state */
3249
3250                 while ( state && uc <= (U8*)PL_regeol ) {
3251                     U32 base = trie->states[ state ].trans.base;
3252                     UV uvc = 0;
3253                     U16 charid = 0;
3254                     U16 wordnum;
3255                     wordnum = trie->states[ state ].wordnum;
3256
3257                     if (wordnum) { /* it's an accept state */
3258                         if (!accepted) {
3259                             accepted = 1;
3260                             /* record first match position */
3261                             if (ST.longfold) {
3262                                 ST.firstpos = (U8*)locinput;
3263                                 ST.firstchars = 0;
3264                             }
3265                             else {
3266                                 ST.firstpos = uc;
3267                                 ST.firstchars = charcount;
3268                             }
3269                         }
3270                         if (!ST.nextword || wordnum < ST.nextword)
3271                             ST.nextword = wordnum;
3272                         ST.topword = wordnum;
3273                     }
3274
3275                     DEBUG_TRIE_EXECUTE_r({
3276                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3277                                 PerlIO_printf( Perl_debug_log,
3278                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3279                                     2+depth * 2, "", PL_colors[4],
3280                                     (UV)state, (accepted ? 'Y' : 'N'));
3281                     });
3282
3283                     /* read a char and goto next state */
3284                     if ( base ) {
3285                         I32 offset;
3286                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3287                                              uscan, len, uvc, charid, foldlen,
3288                                              foldbuf, uniflags);
3289                         charcount++;
3290                         if (foldlen>0)
3291                             ST.longfold = TRUE;
3292                         if (charid &&
3293                              ( ((offset =
3294                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3295
3296                              && ((U32)offset < trie->lasttrans)
3297                              && trie->trans[offset].check == state)
3298                         {
3299                             state = trie->trans[offset].next;
3300                         }
3301                         else {
3302                             state = 0;
3303                         }
3304                         uc += len;
3305
3306                     }
3307                     else {
3308                         state = 0;
3309                     }
3310                     DEBUG_TRIE_EXECUTE_r(
3311                         PerlIO_printf( Perl_debug_log,
3312                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3313                             charid, uvc, (UV)state, PL_colors[5] );
3314                     );
3315                 }
3316                 if (!accepted)
3317                    sayNO;
3318
3319                 /* calculate total number of accept states */
3320                 {
3321                     U16 w = ST.topword;
3322                     accepted = 0;
3323                     while (w) {
3324                         w = trie->wordinfo[w].prev;
3325                         accepted++;
3326                     }
3327                     ST.accepted = accepted;
3328                 }
3329
3330                 DEBUG_EXECUTE_r(
3331                     PerlIO_printf( Perl_debug_log,
3332                         "%*s  %sgot %"IVdf" possible matches%s\n",
3333                         REPORT_CODE_OFF + depth * 2, "",
3334                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3335                 );
3336                 goto trie_first_try; /* jump into the fail handler */
3337             }}
3338             /* NOTREACHED */
3339
3340         case TRIE_next_fail: /* we failed - try next alternative */
3341             if ( ST.jump) {
3342                 REGCP_UNWIND(ST.cp);
3343                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3344                     PL_regoffs[n].end = -1;
3345                 *PL_reglastparen = n;
3346             }
3347             if (!--ST.accepted) {
3348                 DEBUG_EXECUTE_r({
3349                     PerlIO_printf( Perl_debug_log,
3350                         "%*s  %sTRIE failed...%s\n",
3351                         REPORT_CODE_OFF+depth*2, "",
3352                         PL_colors[4],
3353                         PL_colors[5] );
3354                 });
3355                 sayNO_SILENT;
3356             }
3357             {
3358                 /* Find next-highest word to process.  Note that this code
3359                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3360                 register U16 min = 0;
3361                 register U16 word;
3362                 register U16 const nextword = ST.nextword;
3363                 register reg_trie_wordinfo * const wordinfo
3364                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3365                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3366                     if (word > nextword && (!min || word < min))
3367                         min = word;
3368                 }
3369                 ST.nextword = min;
3370             }
3371
3372           trie_first_try:
3373             if (do_cutgroup) {
3374                 do_cutgroup = 0;
3375                 no_final = 0;
3376             }
3377
3378             if ( ST.jump) {
3379                 ST.lastparen = *PL_reglastparen;
3380                 REGCP_SET(ST.cp);
3381             }
3382
3383             /* find start char of end of current word */
3384             {
3385                 U32 chars; /* how many chars to skip */
3386                 U8 *uc = ST.firstpos;
3387                 reg_trie_data * const trie
3388                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3389
3390                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3391                             >=  ST.firstchars);
3392                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3393                             - ST.firstchars;
3394
3395                 if (ST.longfold) {
3396                     /* the hard option - fold each char in turn and find
3397                      * its folded length (which may be different */
3398                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3399                     STRLEN foldlen;
3400                     STRLEN len;
3401                     UV uvc;
3402                     U8 *uscan;
3403
3404                     while (chars) {
3405                         if (utf8_target) {
3406                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3407                                                     uniflags);
3408                             uc += len;
3409                         }
3410                         else {
3411                             uvc = *uc;
3412                             uc++;
3413                         }
3414                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3415                         uscan = foldbuf;
3416                         while (foldlen) {
3417                             if (!--chars)
3418                                 break;
3419                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3420                                             uniflags);
3421                             uscan += len;
3422                             foldlen -= len;
3423                         }
3424                     }
3425                 }
3426                 else {
3427                     if (utf8_target)
3428                         while (chars--)
3429                             uc += UTF8SKIP(uc);
3430                     else
3431                         uc += chars;
3432                 }
3433                 PL_reginput = (char *)uc;
3434             }
3435
3436             scan = (ST.jump && ST.jump[ST.nextword])
3437                         ? ST.me + ST.jump[ST.nextword]
3438                         : ST.B;
3439
3440             DEBUG_EXECUTE_r({
3441                 PerlIO_printf( Perl_debug_log,
3442                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3443                     REPORT_CODE_OFF+depth*2, "",
3444                     PL_colors[4],
3445                     ST.nextword,
3446                     PL_colors[5]
3447                     );
3448             });
3449
3450             if (ST.accepted > 1 || has_cutgroup) {
3451                 PUSH_STATE_GOTO(TRIE_next, scan);
3452                 /* NOTREACHED */
3453             }
3454             /* only one choice left - just continue */
3455             DEBUG_EXECUTE_r({
3456                 AV *const trie_words
3457                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3458                 SV ** const tmp = av_fetch( trie_words,
3459                     ST.nextword-1, 0 );
3460                 SV *sv= tmp ? sv_newmortal() : NULL;
3461
3462                 PerlIO_printf( Perl_debug_log,
3463                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3464                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3465                     ST.nextword,
3466                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3467                             PL_colors[0], PL_colors[1],
3468                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3469                         )
3470                     : "not compiled under -Dr",
3471                     PL_colors[5] );
3472             });
3473
3474             locinput = PL_reginput;
3475             nextchr = UCHARAT(locinput);
3476             continue; /* execute rest of RE */
3477             /* NOTREACHED */
3478 #undef  ST
3479
3480         case EXACT: {
3481             char *s = STRING(scan);
3482             ln = STR_LEN(scan);
3483             if (utf8_target != UTF_PATTERN) {
3484                 /* The target and the pattern have differing utf8ness. */
3485                 char *l = locinput;
3486                 const char * const e = s + ln;
3487
3488                 if (utf8_target) {
3489                     /* The target is utf8, the pattern is not utf8. */
3490                     while (s < e) {
3491                         STRLEN ulen;
3492                         if (l >= PL_regeol)
3493                              sayNO;
3494                         if (NATIVE_TO_UNI(*(U8*)s) !=
3495                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3496                                             uniflags))
3497                              sayNO;
3498                         l += ulen;
3499                         s ++;
3500                     }
3501                 }
3502                 else {
3503                     /* The target is not utf8, the pattern is utf8. */
3504                     while (s < e) {
3505                         STRLEN ulen;
3506                         if (l >= PL_regeol)
3507                             sayNO;
3508                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3509                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3510                                            uniflags))
3511                             sayNO;
3512                         s += ulen;
3513                         l ++;
3514                     }
3515                 }
3516                 locinput = l;
3517                 nextchr = UCHARAT(locinput);
3518                 break;
3519             }
3520             /* The target and the pattern have the same utf8ness. */
3521             /* Inline the first character, for speed. */
3522             if (UCHARAT(s) != nextchr)
3523                 sayNO;
3524             if (PL_regeol - locinput < ln)
3525                 sayNO;
3526             if (ln > 1 && memNE(s, locinput, ln))
3527                 sayNO;
3528             locinput += ln;
3529             nextchr = UCHARAT(locinput);
3530             break;
3531             }
3532         case EXACTFL:
3533             PL_reg_flags |= RF_tainted;
3534             /* FALL THROUGH */
3535         case EXACTF: {
3536             char * const s = STRING(scan);
3537             ln = STR_LEN(scan);
3538
3539             if (utf8_target || UTF_PATTERN) {
3540               /* Either target or the pattern are utf8. */
3541                 const char * const l = locinput;
3542                 char *e = PL_regeol;
3543
3544                 if (! foldEQ_utf8(s, 0,  ln, cBOOL(UTF_PATTERN),
3545                                l, &e, 0,  utf8_target)) {
3546                      /* One more case for the sharp s:
3547                       * pack("U0U*", 0xDF) =~ /ss/i,
3548                       * the 0xC3 0x9F are the UTF-8
3549                       * byte sequence for the U+00DF. */
3550
3551                      if (!(utf8_target &&
3552                            toLOWER(s[0]) == 's' &&
3553                            ln >= 2 &&
3554                            toLOWER(s[1]) == 's' &&
3555                            (U8)l[0] == 0xC3 &&
3556                            e - l >= 2 &&
3557                            (U8)l[1] == 0x9F))
3558                           sayNO;
3559                 }
3560                 locinput = e;
3561                 nextchr = UCHARAT(locinput);
3562                 break;
3563             }
3564
3565             /* Neither the target and the pattern are utf8. */
3566
3567             /* Inline the first character, for speed. */
3568             if (UCHARAT(s) != nextchr &&
3569                 UCHARAT(s) != ((OP(scan) == EXACTF)
3570                                ? PL_fold : PL_fold_locale)[nextchr])
3571                 sayNO;
3572             if (PL_regeol - locinput < ln)
3573                 sayNO;
3574             if (ln > 1 && (OP(scan) == EXACTF
3575                            ? ! foldEQ(s, locinput, ln)
3576                            : ! foldEQ_locale(s, locinput, ln)))
3577                 sayNO;
3578             locinput += ln;
3579             nextchr = UCHARAT(locinput);
3580             break;
3581             }
3582         case BOUNDL:
3583         case NBOUNDL:
3584             PL_reg_flags |= RF_tainted;
3585             /* FALL THROUGH */
3586         case BOUND:
3587         case NBOUND:
3588             /* was last char in word? */
3589             if (utf8_target) {
3590                 if (locinput == PL_bostr)
3591                     ln = '\n';
3592                 else {
3593                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3594
3595                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3596                 }
3597                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3598                     ln = isALNUM_uni(ln);
3599                     LOAD_UTF8_CHARCLASS_ALNUM();
3600                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3601                 }
3602                 else {
3603                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3604                     n = isALNUM_LC_utf8((U8*)locinput);
3605                 }
3606             }
3607             else {
3608                 ln = (locinput != PL_bostr) ?
3609                     UCHARAT(locinput - 1) : '\n';
3610                 if (FLAGS(scan) & USE_UNI) {
3611
3612                     /* Here, can't be BOUNDL or NBOUNDL because they never set
3613                      * the flags to USE_UNI */
3614                     ln = isWORDCHAR_L1(ln);
3615                     n = isWORDCHAR_L1(nextchr);
3616                 }
3617                 else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3618                     ln = isALNUM(ln);
3619                     n = isALNUM(nextchr);
3620                 }
3621                 else {
3622                     ln = isALNUM_LC(ln);
3623                     n = isALNUM_LC(nextchr);
3624                 }
3625             }
3626             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3627                                     OP(scan) == BOUNDL))
3628                     sayNO;
3629             break;
3630         case ANYOF:
3631             if (utf8_target) {
3632                 STRLEN inclasslen = PL_regeol - locinput;
3633
3634                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3635                     goto anyof_fail;
3636                 if (locinput >= PL_regeol)
3637                     sayNO;
3638                 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3639                 nextchr = UCHARAT(locinput);
3640                 break;
3641             }
3642             else {
3643                 if (nextchr < 0)
3644                     nextchr = UCHARAT(locinput);
3645                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3646                     goto anyof_fail;
3647                 if (!nextchr && locinput >= PL_regeol)
3648                     sayNO;
3649                 nextchr = UCHARAT(++locinput);
3650                 break;
3651             }
3652         anyof_fail:
3653             /* If we might have the case of the German sharp s
3654              * in a casefolding Unicode character class. */
3655
3656             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3657                  locinput += SHARP_S_SKIP;
3658                  nextchr = UCHARAT(locinput);
3659             }
3660             else
3661                  sayNO;
3662             break;
3663         /* Special char classes - The defines start on line 129 or so */
3664         CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3665         CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3666
3667         CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3668         CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3669
3670         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3671         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3672
3673         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3674                        a Unicode extended Grapheme Cluster */
3675             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3676               extended Grapheme Cluster is:
3677
3678                CR LF
3679                | Prepend* Begin Extend*
3680                | .
3681
3682                Begin is (Hangul-syllable | ! Control)
3683                Extend is (Grapheme_Extend | Spacing_Mark)
3684                Control is [ GCB_Control CR LF ]
3685
3686                The discussion below shows how the code for CLUMP is derived
3687                from this regex.  Note that most of these concepts are from
3688                property values of the Grapheme Cluster Boundary (GCB) property.
3689                No code point can have multiple property values for a given
3690                property.  Thus a code point in Prepend can't be in Control, but
3691                it must be in !Control.  This is why Control above includes
3692                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3693                property separately, and so can't be in GCB_Control, even though
3694                they logically are controls.  Control is not the same as gc=cc,
3695                but includes format and other characters as well.
3696
3697                The Unicode definition of Hangul-syllable is:
3698                    L+
3699                    | (L* ( ( V | LV ) V* | LVT ) T*)
3700                    | T+
3701                   )
3702                Each of these is a value for the GCB property, and hence must be
3703                disjoint, so the order they are tested is immaterial, so the
3704                above can safely be changed to
3705                    T+
3706                    | L+
3707                    | (L* ( LVT | ( V | LV ) V*) T*)
3708
3709                The last two terms can be combined like this:
3710                    L* ( L
3711                         | (( LVT | ( V | LV ) V*) T*))
3712
3713                And refactored into this:
3714                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3715
3716                That means that if we have seen any L's at all we can quit
3717                there, but if the next character is a LVT, a V or and LV we
3718                should keep going.
3719
3720                There is a subtlety with Prepend* which showed up in testing.
3721                Note that the Begin, and only the Begin is required in:
3722                 | Prepend* Begin Extend*
3723                Also, Begin contains '! Control'.  A Prepend must be a '!
3724                Control', which means it must be a Begin.  What it comes down to
3725                is that if we match Prepend* and then find no suitable Begin
3726                afterwards, that if we backtrack the last Prepend, that one will
3727                be a suitable Begin.
3728             */
3729
3730             if (locinput >= PL_regeol)
3731                 sayNO;
3732             if  (! utf8_target) {
3733
3734                 /* Match either CR LF  or '.', as all the other possibilities
3735                  * require utf8 */
3736                 locinput++;         /* Match the . or CR */
3737                 if (nextchr == '\r'
3738                     && locinput < PL_regeol
3739                     && UCHARAT(locinput) == '\n') locinput++;
3740             }
3741             else {
3742
3743                 /* Utf8: See if is ( CR LF ); already know that locinput <
3744                  * PL_regeol, so locinput+1 is in bounds */
3745                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3746                     locinput += 2;
3747                 }
3748                 else {
3749                     /* In case have to backtrack to beginning, then match '.' */
3750                     char *starting = locinput;
3751
3752                     /* In case have to backtrack the last prepend */
3753                     char *previous_prepend = 0;
3754
3755                     LOAD_UTF8_CHARCLASS_GCB();
3756
3757                     /* Match (prepend)* */
3758                     while (locinput < PL_regeol
3759                            && swash_fetch(PL_utf8_X_prepend,
3760                                           (U8*)locinput, utf8_target))
3761                     {
3762                         previous_prepend = locinput;
3763                         locinput += UTF8SKIP(locinput);
3764                     }
3765
3766                     /* As noted above, if we matched a prepend character, but
3767                      * the next thing won't match, back off the last prepend we
3768                      * matched, as it is guaranteed to match the begin */
3769                     if (previous_prepend
3770                         && (locinput >=  PL_regeol
3771                             || ! swash_fetch(PL_utf8_X_begin,
3772                                              (U8*)locinput, utf8_target)))
3773                     {
3774                         locinput = previous_prepend;
3775                     }
3776
3777                     /* Note that here we know PL_regeol > locinput, as we
3778                      * tested that upon input to this switch case, and if we
3779                      * moved locinput forward, we tested the result just above
3780                      * and it either passed, or we backed off so that it will
3781                      * now pass */
3782                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3783
3784                         /* Here did not match the required 'Begin' in the
3785                          * second term.  So just match the very first
3786                          * character, the '.' of the final term of the regex */
3787                         locinput = starting + UTF8SKIP(starting);
3788                     } else {
3789
3790                         /* Here is the beginning of a character that can have
3791                          * an extender.  It is either a hangul syllable, or a
3792                          * non-control */
3793                         if (swash_fetch(PL_utf8_X_non_hangul,
3794                                         (U8*)locinput, utf8_target))
3795                         {
3796
3797                             /* Here not a Hangul syllable, must be a
3798                              * ('!  * Control') */
3799                             locinput += UTF8SKIP(locinput);
3800                         } else {
3801
3802                             /* Here is a Hangul syllable.  It can be composed
3803                              * of several individual characters.  One
3804                              * possibility is T+ */
3805                             if (swash_fetch(PL_utf8_X_T,
3806                                             (U8*)locinput, utf8_target))
3807                             {
3808                                 while (locinput < PL_regeol
3809                                         && swash_fetch(PL_utf8_X_T,
3810                                                         (U8*)locinput, utf8_target))
3811                                 {
3812                                     locinput += UTF8SKIP(locinput);
3813                                 }
3814                             } else {
3815
3816                                 /* Here, not T+, but is a Hangul.  That means
3817                                  * it is one of the others: L, LV, LVT or V,
3818                                  * and matches:
3819                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3820
3821                                 /* Match L*           */
3822                                 while (locinput < PL_regeol
3823                                         && swash_fetch(PL_utf8_X_L,
3824                                                         (U8*)locinput, utf8_target))
3825                                 {
3826                                     locinput += UTF8SKIP(locinput);
3827                                 }
3828
3829                                 /* Here, have exhausted L*.  If the next
3830                                  * character is not an LV, LVT nor V, it means
3831                                  * we had to have at least one L, so matches L+
3832                                  * in the original equation, we have a complete
3833                                  * hangul syllable.  Are done. */
3834
3835                                 if (locinput < PL_regeol
3836                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3837                                                     (U8*)locinput, utf8_target))
3838                                 {
3839
3840                                     /* Otherwise keep going.  Must be LV, LVT
3841                                      * or V.  See if LVT */
3842                                     if (swash_fetch(PL_utf8_X_LVT,
3843                                                     (U8*)locinput, utf8_target))
3844                                     {
3845                                         locinput += UTF8SKIP(locinput);
3846                                     } else {
3847
3848                                         /* Must be  V or LV.  Take it, then
3849                                          * match V*     */
3850                                         locinput += UTF8SKIP(locinput);
3851                                         while (locinput < PL_regeol
3852                                                 && swash_fetch(PL_utf8_X_V,
3853                                                          (U8*)locinput, utf8_target))
3854                                         {
3855                                             locinput += UTF8SKIP(locinput);
3856                                         }
3857                                     }
3858
3859                                     /* And any of LV, LVT, or V can be followed
3860                                      * by T*            */
3861                                     while (locinput < PL_regeol
3862                                            && swash_fetch(PL_utf8_X_T,
3863                                                            (U8*)locinput,
3864                                                            utf8_target))
3865                                     {
3866                                         locinput += UTF8SKIP(locinput);
3867                                     }
3868                                 }
3869                             }
3870                         }
3871
3872                         /* Match any extender */
3873                         while (locinput < PL_regeol
3874                                 && swash_fetch(PL_utf8_X_extend,
3875                                                 (U8*)locinput, utf8_target))
3876                         {
3877                             locinput += UTF8SKIP(locinput);
3878                         }
3879                     }
3880                 }
3881                 if (locinput > PL_regeol) sayNO;
3882             }
3883             nextchr = UCHARAT(locinput);
3884             break;
3885
3886         case NREFFL:
3887         {
3888             char *s;
3889             char type;
3890             PL_reg_flags |= RF_tainted;
3891             /* FALL THROUGH */
3892         case NREF:
3893         case NREFF:
3894             type = OP(scan);
3895             n = reg_check_named_buff_matched(rex,scan);
3896
3897             if ( n ) {
3898                 type = REF + ( type - NREF );
3899                 goto do_ref;
3900             } else {
3901                 sayNO;
3902             }
3903             /* unreached */
3904         case REFFL:
3905             PL_reg_flags |= RF_tainted;
3906             /* FALL THROUGH */
3907         case REF:
3908         case REFF:
3909             n = ARG(scan);  /* which paren pair */
3910             type = OP(scan);
3911           do_ref:
3912             ln = PL_regoffs[n].start;
3913             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
3914             if (*PL_reglastparen < n || ln == -1)
3915                 sayNO;                  /* Do not match unless seen CLOSEn. */
3916             if (ln == PL_regoffs[n].end)
3917                 break;
3918
3919             s = PL_bostr + ln;
3920             if (utf8_target && type != REF) {   /* REF can do byte comparison */
3921                 char *l = locinput;
3922                 const char *e = PL_bostr + PL_regoffs[n].end;
3923                 /*
3924                  * Note that we can't do the "other character" lookup trick as
3925                  * in the 8-bit case (no pun intended) because in Unicode we
3926                  * have to map both upper and title case to lower case.
3927                  */
3928                 if (type == REFF) {
3929                     while (s < e) {
3930                         STRLEN ulen1, ulen2;
3931                         U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3932                         U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3933
3934                         if (l >= PL_regeol)
3935                             sayNO;
3936                         toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3937                         toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3938                         if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3939                             sayNO;
3940                         s += ulen1;
3941                         l += ulen2;
3942                     }
3943                 }
3944                 locinput = l;
3945                 nextchr = UCHARAT(locinput);
3946                 break;
3947             }
3948
3949             /* Inline the first character, for speed. */
3950             if (UCHARAT(s) != nextchr &&
3951                 (type == REF ||
3952                  (UCHARAT(s) != (type == REFF
3953                                   ? PL_fold : PL_fold_locale)[nextchr])))
3954                 sayNO;
3955             ln = PL_regoffs[n].end - ln;
3956             if (locinput + ln > PL_regeol)
3957                 sayNO;
3958             if (ln > 1 && (type == REF
3959                            ? memNE(s, locinput, ln)
3960                            : (type == REFF
3961                               ? ! foldEQ(s, locinput, ln)
3962                               : ! foldEQ_locale(s, locinput, ln))))
3963                 sayNO;
3964             locinput += ln;
3965             nextchr = UCHARAT(locinput);
3966             break;
3967         }
3968         case NOTHING:
3969         case TAIL:
3970             break;
3971         case BACK:
3972             break;
3973
3974 #undef  ST
3975 #define ST st->u.eval
3976         {
3977             SV *ret;
3978             REGEXP *re_sv;
3979             regexp *re;
3980             regexp_internal *rei;
3981             regnode *startpoint;
3982
3983         case GOSTART:
3984         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3985             if (cur_eval && cur_eval->locinput==locinput) {
3986                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3987                     Perl_croak(aTHX_ "Infinite recursion in regex");
3988                 if ( ++nochange_depth > max_nochange_depth )
3989                     Perl_croak(aTHX_
3990                         "Pattern subroutine nesting without pos change"
3991                         " exceeded limit in regex");
3992             } else {
3993                 nochange_depth = 0;
3994             }
3995             re_sv = rex_sv;
3996             re = rex;
3997             rei = rexi;
3998             (void)ReREFCNT_inc(rex_sv);
3999             if (OP(scan)==GOSUB) {
4000                 startpoint = scan + ARG2L(scan);
4001                 ST.close_paren = ARG(scan);
4002             } else {
4003                 startpoint = rei->program+1;
4004                 ST.close_paren = 0;
4005             }
4006             goto eval_recurse_doit;
4007             /* NOTREACHED */
4008         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4009             if (cur_eval && cur_eval->locinput==locinput) {
4010                 if ( ++nochange_depth > max_nochange_depth )
4011                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4012             } else {
4013                 nochange_depth = 0;
4014             }
4015             {
4016                 /* execute the code in the {...} */
4017                 dSP;
4018                 SV ** const before = SP;
4019                 OP_4tree * const oop = PL_op;
4020                 COP * const ocurcop = PL_curcop;
4021                 PAD *old_comppad;
4022                 char *saved_regeol = PL_regeol;
4023                 struct re_save_state saved_state;
4024
4025                 /* To not corrupt the existing regex state while executing the
4026                  * eval we would normally put it on the save stack, like with
4027                  * save_re_context. However, re-evals have a weird scoping so we
4028                  * can't just add ENTER/LEAVE here. With that, things like
4029                  *
4030                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4031                  *
4032                  * would break, as they expect the localisation to be unwound
4033                  * only when the re-engine backtracks through the bit that
4034                  * localised it.
4035                  *
4036                  * What we do instead is just saving the state in a local c
4037                  * variable.
4038                  */
4039                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4040
4041                 n = ARG(scan);
4042                 PL_op = (OP_4tree*)rexi->data->data[n];
4043                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4044                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4045                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4046                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4047
4048                 if (sv_yes_mark) {
4049                     SV *sv_mrk = get_sv("REGMARK", 1);
4050                     sv_setsv(sv_mrk, sv_yes_mark);
4051                 }
4052
4053                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4054                 SPAGAIN;
4055                 if (SP == before)
4056                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4057                 else {
4058                     ret = POPs;
4059                     PUTBACK;
4060                 }
4061
4062                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4063
4064                 PL_op = oop;
4065                 PAD_RESTORE_LOCAL(old_comppad);
4066                 PL_curcop = ocurcop;
4067                 PL_regeol = saved_regeol;
4068                 if (!logical) {
4069                     /* /(?{...})/ */
4070                     sv_setsv(save_scalar(PL_replgv), ret);
4071                     break;
4072                 }
4073             }
4074             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4075                 logical = 0;
4076                 {
4077                     /* extract RE object from returned value; compiling if
4078                      * necessary */
4079                     MAGIC *mg = NULL;
4080                     REGEXP *rx = NULL;
4081
4082                     if (SvROK(ret)) {
4083                         SV *const sv = SvRV(ret);
4084
4085                         if (SvTYPE(sv) == SVt_REGEXP) {
4086                             rx = (REGEXP*) sv;
4087                         } else if (SvSMAGICAL(sv)) {
4088                             mg = mg_find(sv, PERL_MAGIC_qr);
4089                             assert(mg);
4090                         }
4091                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4092                         rx = (REGEXP*) ret;
4093                     } else if (SvSMAGICAL(ret)) {
4094                         if (SvGMAGICAL(ret)) {
4095                             /* I don't believe that there is ever qr magic
4096                                here.  */
4097                             assert(!mg_find(ret, PERL_MAGIC_qr));
4098                             sv_unmagic(ret, PERL_MAGIC_qr);
4099                         }
4100                         else {
4101                             mg = mg_find(ret, PERL_MAGIC_qr);
4102                             /* testing suggests mg only ends up non-NULL for
4103                                scalars who were upgraded and compiled in the
4104                                else block below. In turn, this is only
4105                                triggered in the "postponed utf8 string" tests
4106                                in t/op/pat.t  */
4107                         }
4108                     }
4109
4110                     if (mg) {
4111                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4112                         assert(rx);
4113                     }
4114                     if (rx) {
4115                         rx = reg_temp_copy(NULL, rx);
4116                     }
4117                     else {
4118                         U32 pm_flags = 0;
4119                         const I32 osize = PL_regsize;
4120
4121                         if (DO_UTF8(ret)) {
4122                             assert (SvUTF8(ret));
4123                         } else if (SvUTF8(ret)) {
4124                             /* Not doing UTF-8, despite what the SV says. Is
4125                                this only if we're trapped in use 'bytes'?  */
4126                             /* Make a copy of the octet sequence, but without
4127                                the flag on, as the compiler now honours the
4128                                SvUTF8 flag on ret.  */
4129                             STRLEN len;
4130                             const char *const p = SvPV(ret, len);
4131                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4132                         }
4133                         rx = CALLREGCOMP(ret, pm_flags);
4134                         if (!(SvFLAGS(ret)
4135                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4136                                  | SVs_GMG))) {
4137                             /* This isn't a first class regexp. Instead, it's
4138                                caching a regexp onto an existing, Perl visible
4139                                scalar.  */
4140                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4141                         }
4142                         PL_regsize = osize;
4143                     }
4144                     re_sv = rx;
4145                     re = (struct regexp *)SvANY(rx);
4146                 }
4147                 RXp_MATCH_COPIED_off(re);
4148                 re->subbeg = rex->subbeg;
4149                 re->sublen = rex->sublen;
4150                 rei = RXi_GET(re);
4151                 DEBUG_EXECUTE_r(
4152                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4153                         "Matching embedded");
4154                 );
4155                 startpoint = rei->program + 1;
4156                 ST.close_paren = 0; /* only used for GOSUB */
4157                 /* borrowed from regtry */
4158                 if (PL_reg_start_tmpl <= re->nparens) {
4159                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4160                     if(PL_reg_start_tmp)
4161                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4162                     else
4163                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4164                 }
4165
4166         eval_recurse_doit: /* Share code with GOSUB below this line */
4167                 /* run the pattern returned from (??{...}) */
4168                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4169                 REGCP_SET(ST.lastcp);
4170
4171                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4172
4173                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4174                 PL_reglastparen = &re->lastparen;
4175                 PL_reglastcloseparen = &re->lastcloseparen;
4176                 re->lastparen = 0;
4177                 re->lastcloseparen = 0;
4178
4179                 PL_reginput = locinput;
4180                 PL_regsize = 0;
4181
4182                 /* XXXX This is too dramatic a measure... */
4183                 PL_reg_maxiter = 0;
4184
4185                 ST.toggle_reg_flags = PL_reg_flags;
4186                 if (RX_UTF8(re_sv))
4187                     PL_reg_flags |= RF_utf8;
4188                 else
4189                     PL_reg_flags &= ~RF_utf8;
4190                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4191
4192                 ST.prev_rex = rex_sv;
4193                 ST.prev_curlyx = cur_curlyx;
4194                 SETREX(rex_sv,re_sv);
4195                 rex = re;
4196                 rexi = rei;
4197                 cur_curlyx = NULL;
4198                 ST.B = next;
4199                 ST.prev_eval = cur_eval;
4200                 cur_eval = st;
4201                 /* now continue from first node in postoned RE */
4202                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4203                 /* NOTREACHED */
4204             }
4205             /* logical is 1,   /(?(?{...})X|Y)/ */
4206             sw = cBOOL(SvTRUE(ret));
4207             logical = 0;
4208             break;
4209         }
4210
4211         case EVAL_AB: /* cleanup after a successful (??{A})B */
4212             /* note: this is called twice; first after popping B, then A */
4213             PL_reg_flags ^= ST.toggle_reg_flags;
4214             ReREFCNT_dec(rex_sv);
4215             SETREX(rex_sv,ST.prev_rex);
4216             rex = (struct regexp *)SvANY(rex_sv);
4217             rexi = RXi_GET(rex);
4218             regcpblow(ST.cp);
4219             cur_eval = ST.prev_eval;
4220             cur_curlyx = ST.prev_curlyx;
4221
4222             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4223             PL_reglastparen = &rex->lastparen;
4224             PL_reglastcloseparen = &rex->lastcloseparen;
4225             /* also update PL_regoffs */
4226             PL_regoffs = rex->offs;
4227
4228             /* XXXX This is too dramatic a measure... */
4229             PL_reg_maxiter = 0;
4230             if ( nochange_depth )
4231                 nochange_depth--;
4232             sayYES;
4233
4234
4235         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4236             /* note: this is called twice; first after popping B, then A */
4237             PL_reg_flags ^= ST.toggle_reg_flags;
4238             ReREFCNT_dec(rex_sv);
4239             SETREX(rex_sv,ST.prev_rex);
4240             rex = (struct regexp *)SvANY(rex_sv);
4241             rexi = RXi_GET(rex);
4242             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4243             PL_reglastparen = &rex->lastparen;
4244             PL_reglastcloseparen = &rex->lastcloseparen;
4245
4246             PL_reginput = locinput;
4247             REGCP_UNWIND(ST.lastcp);
4248             regcppop(rex);
4249             cur_eval = ST.prev_eval;
4250             cur_curlyx = ST.prev_curlyx;
4251             /* XXXX This is too dramatic a measure... */
4252             PL_reg_maxiter = 0;
4253             if ( nochange_depth )
4254                 nochange_depth--;
4255             sayNO_SILENT;
4256 #undef ST
4257
4258         case OPEN:
4259             n = ARG(scan);  /* which paren pair */
4260             PL_reg_start_tmp[n] = locinput;
4261             if (n > PL_regsize)
4262                 PL_regsize = n;
4263             lastopen = n;
4264             break;
4265         case CLOSE:
4266             n = ARG(scan);  /* which paren pair */
4267             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4268             PL_regoffs[n].end = locinput - PL_bostr;
4269             /*if (n > PL_regsize)
4270                 PL_regsize = n;*/
4271             if (n > *PL_reglastparen)
4272                 *PL_reglastparen = n;
4273             *PL_reglastcloseparen = n;
4274             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4275                 goto fake_end;
4276             }
4277             break;
4278         case ACCEPT:
4279             if (ARG(scan)){
4280                 regnode *cursor;
4281                 for (cursor=scan;
4282                      cursor && OP(cursor)!=END;
4283                      cursor=regnext(cursor))
4284                 {
4285                     if ( OP(cursor)==CLOSE ){
4286                         n = ARG(cursor);
4287                         if ( n <= lastopen ) {
4288                             PL_regoffs[n].start
4289                                 = PL_reg_start_tmp[n] - PL_bostr;
4290                             PL_regoffs[n].end = locinput - PL_bostr;
4291                             /*if (n > PL_regsize)
4292                             PL_regsize = n;*/
4293                             if (n > *PL_reglastparen)
4294                                 *PL_reglastparen = n;
4295                             *PL_reglastcloseparen = n;
4296                             if ( n == ARG(scan) || (cur_eval &&
4297                                 cur_eval->u.eval.close_paren == n))
4298                                 break;
4299                         }
4300                     }
4301                 }
4302             }
4303             goto fake_end;
4304             /*NOTREACHED*/
4305         case GROUPP:
4306             n = ARG(scan);  /* which paren pair */
4307             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4308             break;
4309         case NGROUPP:
4310             /* reg_check_named_buff_matched returns 0 for no match */
4311             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4312             break;
4313         case INSUBP:
4314             n = ARG(scan);
4315             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4316             break;
4317         case DEFINEP:
4318             sw = 0;
4319             break;
4320         case IFTHEN:
4321             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4322             if (sw)
4323                 next = NEXTOPER(NEXTOPER(scan));
4324             else {
4325                 next = scan + ARG(scan);
4326                 if (OP(next) == IFTHEN) /* Fake one. */
4327                     next = NEXTOPER(NEXTOPER(next));
4328             }
4329             break;
4330         case LOGICAL:
4331             logical = scan->flags;
4332             break;
4333
4334 /*******************************************************************
4335
4336 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4337 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4338 STAR/PLUS/CURLY/CURLYN are used instead.)
4339
4340 A*B is compiled as <CURLYX><A><WHILEM><B>
4341
4342 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4343 state, which contains the current count, initialised to -1. It also sets
4344 cur_curlyx to point to this state, with any previous value saved in the
4345 state block.
4346
4347 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4348 since the pattern may possibly match zero times (i.e. it's a while {} loop
4349 rather than a do {} while loop).
4350
4351 Each entry to WHILEM represents a successful match of A. The count in the
4352 CURLYX block is incremented, another WHILEM state is pushed, and execution
4353 passes to A or B depending on greediness and the current count.
4354
4355 For example, if matching against the string a1a2a3b (where the aN are
4356 substrings that match /A/), then the match progresses as follows: (the
4357 pushed states are interspersed with the bits of strings matched so far):
4358
4359     <CURLYX cnt=-1>
4360     <CURLYX cnt=0><WHILEM>
4361     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4362     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4363     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4364     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4365
4366 (Contrast this with something like CURLYM, which maintains only a single
4367 backtrack state:
4368
4369     <CURLYM cnt=0> a1
4370     a1 <CURLYM cnt=1> a2
4371     a1 a2 <CURLYM cnt=2> a3
4372     a1 a2 a3 <CURLYM cnt=3> b
4373 )
4374
4375 Each WHILEM state block marks a point to backtrack to upon partial failure
4376 of A or B, and also contains some minor state data related to that
4377 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4378 overall state, such as the count, and pointers to the A and B ops.
4379
4380 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4381 must always point to the *current* CURLYX block, the rules are:
4382
4383 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4384 and set cur_curlyx to point the new block.
4385
4386 When popping the CURLYX block after a successful or unsuccessful match,
4387 restore the previous cur_curlyx.
4388
4389 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4390 to the outer one saved in the CURLYX block.
4391
4392 When popping the WHILEM block after a successful or unsuccessful B match,
4393 restore the previous cur_curlyx.
4394
4395 Here's an example for the pattern (AI* BI)*BO
4396 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4397
4398 cur_
4399 curlyx backtrack stack
4400 ------ ---------------
4401 NULL
4402 CO     <CO prev=NULL> <WO>
4403 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4404 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4405 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4406
4407 At this point the pattern succeeds, and we work back down the stack to
4408 clean up, restoring as we go:
4409
4410 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4411 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4412 CO     <CO prev=NULL> <WO>
4413 NULL
4414
4415 *******************************************************************/
4416
4417 #define ST st->u.curlyx
4418
4419         case CURLYX:    /* start of /A*B/  (for complex A) */
4420         {
4421             /* No need to save/restore up to this paren */
4422             I32 parenfloor = scan->flags;
4423
4424             assert(next); /* keep Coverity happy */
4425             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4426                 next += ARG(next);
4427
4428             /* XXXX Probably it is better to teach regpush to support
4429                parenfloor > PL_regsize... */
4430             if (parenfloor > (I32)*PL_reglastparen)
4431                 parenfloor = *PL_reglastparen; /* Pessimization... */
4432
4433             ST.prev_curlyx= cur_curlyx;
4434             cur_curlyx = st;
4435             ST.cp = PL_savestack_ix;
4436
4437             /* these fields contain the state of the current curly.
4438              * they are accessed by subsequent WHILEMs */
4439             ST.parenfloor = parenfloor;
4440             ST.me = scan;
4441             ST.B = next;
4442             ST.minmod = minmod;
4443             minmod = 0;
4444             ST.count = -1;      /* this will be updated by WHILEM */
4445             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4446
4447             PL_reginput = locinput;
4448             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4449             /* NOTREACHED */
4450         }
4451
4452         case CURLYX_end: /* just finished matching all of A*B */
4453             cur_curlyx = ST.prev_curlyx;
4454             sayYES;
4455             /* NOTREACHED */
4456
4457         case CURLYX_end_fail: /* just failed to match all of A*B */
4458             regcpblow(ST.cp);
4459             cur_curlyx = ST.prev_curlyx;
4460             sayNO;
4461             /* NOTREACHED */
4462
4463
4464 #undef ST
4465 #define ST st->u.whilem
4466
4467         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4468         {
4469             /* see the discussion above about CURLYX/WHILEM */
4470             I32 n;
4471             int min = ARG1(cur_curlyx->u.curlyx.me);
4472             int max = ARG2(cur_curlyx->u.curlyx.me);
4473             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4474
4475             assert(cur_curlyx); /* keep Coverity happy */
4476             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4477             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4478             ST.cache_offset = 0;
4479             ST.cache_mask = 0;
4480
4481             PL_reginput = locinput;
4482
4483             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4484                   "%*s  whilem: matched %ld out of %d..%d\n",
4485                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4486             );
4487
4488             /* First just match a string of min A's. */
4489
4490             if (n < min) {
4491                 cur_curlyx->u.curlyx.lastloc = locinput;
4492                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4493                 /* NOTREACHED */
4494             }
4495
4496             /* If degenerate A matches "", assume A done. */
4497
4498             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4499                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4500                    "%*s  whilem: empty match detected, trying continuation...\n",
4501                    REPORT_CODE_OFF+depth*2, "")
4502                 );
4503                 goto do_whilem_B_max;
4504             }
4505
4506             /* super-linear cache processing */
4507
4508             if (scan->flags) {
4509
4510                 if (!PL_reg_maxiter) {
4511                     /* start the countdown: Postpone detection until we
4512                      * know the match is not *that* much linear. */
4513                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4514                     /* possible overflow for long strings and many CURLYX's */
4515                     if (PL_reg_maxiter < 0)
4516                         PL_reg_maxiter = I32_MAX;
4517                     PL_reg_leftiter = PL_reg_maxiter;
4518                 }
4519
4520                 if (PL_reg_leftiter-- == 0) {
4521                     /* initialise cache */
4522                     const I32 size = (PL_reg_maxiter + 7)/8;
4523                     if (PL_reg_poscache) {
4524                         if ((I32)PL_reg_poscache_size < size) {
4525                             Renew(PL_reg_poscache, size, char);
4526                             PL_reg_poscache_size = size;
4527                         }
4528                         Zero(PL_reg_poscache, size, char);
4529                     }
4530                     else {
4531                         PL_reg_poscache_size = size;
4532                         Newxz(PL_reg_poscache, size, char);
4533                     }
4534                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4535       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4536                               PL_colors[4], PL_colors[5])
4537                     );
4538                 }
4539
4540                 if (PL_reg_leftiter < 0) {
4541                     /* have we already failed at this position? */
4542                     I32 offset, mask;
4543                     offset  = (scan->flags & 0xf) - 1
4544                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4545                     mask    = 1 << (offset % 8);
4546                     offset /= 8;
4547                     if (PL_reg_poscache[offset] & mask) {
4548                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4549                             "%*s  whilem: (cache) already tried at this position...\n",
4550                             REPORT_CODE_OFF+depth*2, "")
4551                         );
4552                         sayNO; /* cache records failure */
4553                     }
4554                     ST.cache_offset = offset;
4555                     ST.cache_mask   = mask;
4556                 }
4557             }
4558
4559             /* Prefer B over A for minimal matching. */
4560
4561             if (cur_curlyx->u.curlyx.minmod) {
4562                 ST.save_curlyx = cur_curlyx;
4563                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4564                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4565                 REGCP_SET(ST.lastcp);
4566                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4567                 /* NOTREACHED */
4568             }
4569
4570             /* Prefer A over B for maximal matching. */
4571
4572             if (n < max) { /* More greed allowed? */
4573                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4574                 cur_curlyx->u.curlyx.lastloc = locinput;
4575                 REGCP_SET(ST.lastcp);
4576                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4577                 /* NOTREACHED */
4578             }
4579             goto do_whilem_B_max;
4580         }
4581         /* NOTREACHED */
4582
4583         case WHILEM_B_min: /* just matched B in a minimal match */
4584         case WHILEM_B_max: /* just matched B in a maximal match */
4585             cur_curlyx = ST.save_curlyx;
4586             sayYES;
4587             /* NOTREACHED */
4588
4589         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4590             cur_curlyx = ST.save_curlyx;
4591             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4592             cur_curlyx->u.curlyx.count--;
4593             CACHEsayNO;
4594             /* NOTREACHED */
4595
4596         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4597             REGCP_UNWIND(ST.lastcp);
4598             regcppop(rex);
4599             /* FALL THROUGH */
4600         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4601             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4602             cur_curlyx->u.curlyx.count--;
4603             CACHEsayNO;
4604             /* NOTREACHED */
4605
4606         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4607             REGCP_UNWIND(ST.lastcp);
4608             regcppop(rex);      /* Restore some previous $<digit>s? */
4609             PL_reginput = locinput;
4610             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4611                 "%*s  whilem: failed, trying continuation...\n",
4612                 REPORT_CODE_OFF+depth*2, "")
4613             );
4614           do_whilem_B_max:
4615             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4616                 && ckWARN(WARN_REGEXP)
4617                 && !(PL_reg_flags & RF_warned))
4618             {
4619                 PL_reg_flags |= RF_warned;
4620                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4621                      "Complex regular subexpression recursion",
4622                      REG_INFTY - 1);
4623             }
4624
4625             /* now try B */
4626             ST.save_curlyx = cur_curlyx;
4627             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4628             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4629             /* NOTREACHED */
4630
4631         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4632             cur_curlyx = ST.save_curlyx;
4633             REGCP_UNWIND(ST.lastcp);
4634             regcppop(rex);
4635
4636             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4637                 /* Maximum greed exceeded */
4638                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4639                     && ckWARN(WARN_REGEXP)
4640                     && !(PL_reg_flags & RF_warned))
4641                 {
4642                     PL_reg_flags |= RF_warned;
4643                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4644                         "%s limit (%d) exceeded",
4645                         "Complex regular subexpression recursion",
4646                         REG_INFTY - 1);
4647                 }
4648                 cur_curlyx->u.curlyx.count--;
4649                 CACHEsayNO;
4650             }
4651
4652             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4653                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4654             );
4655             /* Try grabbing another A and see if it helps. */
4656             PL_reginput = locinput;
4657             cur_curlyx->u.curlyx.lastloc = locinput;
4658             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4659             REGCP_SET(ST.lastcp);
4660             PUSH_STATE_GOTO(WHILEM_A_min,
4661                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4662             /* NOTREACHED */
4663
4664 #undef  ST
4665 #define ST st->u.branch
4666
4667         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4668             next = scan + ARG(scan);
4669             if (next == scan)
4670                 next = NULL;
4671             scan = NEXTOPER(scan);
4672             /* FALL THROUGH */
4673
4674         case BRANCH:        /*  /(...|A|...)/ */
4675             scan = NEXTOPER(scan); /* scan now points to inner node */
4676             ST.lastparen = *PL_reglastparen;
4677             ST.next_branch = next;
4678             REGCP_SET(ST.cp);
4679             PL_reginput = locinput;
4680
4681             /* Now go into the branch */
4682             if (has_cutgroup) {
4683                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4684             } else {
4685                 PUSH_STATE_GOTO(BRANCH_next, scan);
4686             }
4687             /* NOTREACHED */
4688         case CUTGROUP:
4689             PL_reginput = locinput;
4690             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4691                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4692             PUSH_STATE_GOTO(CUTGROUP_next,next);
4693             /* NOTREACHED */
4694         case CUTGROUP_next_fail:
4695             do_cutgroup = 1;
4696             no_final = 1;
4697             if (st->u.mark.mark_name)
4698                 sv_commit = st->u.mark.mark_name;
4699             sayNO;
4700             /* NOTREACHED */
4701         case BRANCH_next:
4702             sayYES;
4703             /* NOTREACHED */
4704         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4705             if (do_cutgroup) {
4706                 do_cutgroup = 0;
4707                 no_final = 0;
4708             }
4709             REGCP_UNWIND(ST.cp);
4710             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4711                 PL_regoffs[n].end = -1;
4712             *PL_reglastparen = n;
4713             /*dmq: *PL_reglastcloseparen = n; */
4714             scan = ST.next_branch;
4715             /* no more branches? */
4716             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4717                 DEBUG_EXECUTE_r({
4718                     PerlIO_printf( Perl_debug_log,
4719                         "%*s  %sBRANCH failed...%s\n",
4720                         REPORT_CODE_OFF+depth*2, "",
4721                         PL_colors[4],
4722                         PL_colors[5] );
4723                 });
4724                 sayNO_SILENT;
4725             }
4726             continue; /* execute next BRANCH[J] op */
4727             /* NOTREACHED */
4728
4729         case MINMOD:
4730             minmod = 1;
4731             break;
4732
4733 #undef  ST
4734 #define ST st->u.curlym
4735
4736         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4737
4738             /* This is an optimisation of CURLYX that enables us to push
4739              * only a single backtracking state, no matter how many matches
4740              * there are in {m,n}. It relies on the pattern being constant
4741              * length, with no parens to influence future backrefs
4742              */
4743
4744             ST.me = scan;
4745             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4746
4747             /* if paren positive, emulate an OPEN/CLOSE around A */
4748             if (ST.me->flags) {
4749                 U32 paren = ST.me->flags;
4750                 if (paren > PL_regsize)
4751                     PL_regsize = paren;
4752                 if (paren > *PL_reglastparen)
4753                     *PL_reglastparen = paren;
4754                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4755             }
4756             ST.A = scan;
4757             ST.B = next;
4758             ST.alen = 0;
4759             ST.count = 0;
4760             ST.minmod = minmod;
4761             minmod = 0;
4762             ST.c1 = CHRTEST_UNINIT;
4763             REGCP_SET(ST.cp);
4764
4765             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4766                 goto curlym_do_B;
4767
4768           curlym_do_A: /* execute the A in /A{m,n}B/  */
4769             PL_reginput = locinput;
4770             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4771             /* NOTREACHED */
4772
4773         case CURLYM_A: /* we've just matched an A */
4774             locinput = st->locinput;
4775             nextchr = UCHARAT(locinput);
4776
4777             ST.count++;
4778             /* after first match, determine A's length: u.curlym.alen */
4779             if (ST.count == 1) {
4780                 if (PL_reg_match_utf8) {
4781                     char *s = locinput;
4782                     while (s < PL_reginput) {
4783                         ST.alen++;
4784                         s += UTF8SKIP(s);
4785                     }
4786                 }
4787                 else {
4788                     ST.alen = PL_reginput - locinput;
4789                 }
4790                 if (ST.alen == 0)
4791                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4792             }
4793             DEBUG_EXECUTE_r(
4794                 PerlIO_printf(Perl_debug_log,
4795                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4796                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4797                           (IV) ST.count, (IV)ST.alen)
4798             );
4799
4800             locinput = PL_reginput;
4801
4802             if (cur_eval && cur_eval->u.eval.close_paren &&
4803                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4804                 goto fake_end;
4805
4806             {
4807                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4808                 if ( max == REG_INFTY || ST.count < max )
4809                     goto curlym_do_A; /* try to match another A */
4810             }
4811             goto curlym_do_B; /* try to match B */
4812
4813         case CURLYM_A_fail: /* just failed to match an A */
4814             REGCP_UNWIND(ST.cp);
4815
4816             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4817                 || (cur_eval && cur_eval->u.eval.close_paren &&
4818                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4819                 sayNO;
4820
4821           curlym_do_B: /* execute the B in /A{m,n}B/  */
4822             PL_reginput = locinput;
4823             if (ST.c1 == CHRTEST_UNINIT) {
4824                 /* calculate c1 and c2 for possible match of 1st char
4825                  * following curly */
4826                 ST.c1 = ST.c2 = CHRTEST_VOID;
4827                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4828                     regnode *text_node = ST.B;
4829                     if (! HAS_TEXT(text_node))
4830                         FIND_NEXT_IMPT(text_node);
4831                     /* this used to be
4832
4833                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4834
4835                         But the former is redundant in light of the latter.
4836
4837                         if this changes back then the macro for
4838                         IS_TEXT and friends need to change.
4839                      */
4840                     if (PL_regkind[OP(text_node)] == EXACT)
4841                     {
4842
4843                         ST.c1 = (U8)*STRING(text_node);
4844                         ST.c2 =
4845                             (IS_TEXTF(text_node))
4846                             ? PL_fold[ST.c1]
4847                             : (IS_TEXTFL(text_node))
4848                                 ? PL_fold_locale[ST.c1]
4849                                 : ST.c1;
4850                     }
4851                 }
4852             }
4853
4854             DEBUG_EXECUTE_r(
4855                 PerlIO_printf(Perl_debug_log,
4856                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4857                     (int)(REPORT_CODE_OFF+(depth*2)),
4858                     "", (IV)ST.count)
4859                 );
4860             if (ST.c1 != CHRTEST_VOID
4861                     && UCHARAT(PL_reginput) != ST.c1
4862                     && UCHARAT(PL_reginput) != ST.c2)
4863             {
4864                 /* simulate B failing */
4865                 DEBUG_OPTIMISE_r(
4866                     PerlIO_printf(Perl_debug_log,
4867                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4868                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4869                         (IV)ST.c1,(IV)ST.c2
4870                 ));
4871                 state_num = CURLYM_B_fail;
4872                 goto reenter_switch;
4873             }
4874
4875             if (ST.me->flags) {
4876                 /* mark current A as captured */
4877                 I32 paren = ST.me->flags;
4878                 if (ST.count) {
4879                     PL_regoffs[paren].start
4880                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4881                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4882                     /*dmq: *PL_reglastcloseparen = paren; */
4883                 }
4884                 else
4885                     PL_regoffs[paren].end = -1;
4886                 if (cur_eval && cur_eval->u.eval.close_paren &&
4887                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4888                 {
4889                     if (ST.count)
4890                         goto fake_end;
4891                     else
4892                         sayNO;
4893                 }
4894             }
4895
4896             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4897             /* NOTREACHED */
4898
4899         case CURLYM_B_fail: /* just failed to match a B */
4900             REGCP_UNWIND(ST.cp);
4901             if (ST.minmod) {
4902                 I32 max = ARG2(ST.me);
4903                 if (max != REG_INFTY && ST.count == max)
4904                     sayNO;
4905                 goto curlym_do_A; /* try to match a further A */
4906             }
4907             /* backtrack one A */
4908             if (ST.count == ARG1(ST.me) /* min */)
4909                 sayNO;
4910             ST.count--;
4911             locinput = HOPc(locinput, -ST.alen);
4912             goto curlym_do_B; /* try to match B */
4913
4914 #undef ST
4915 #define ST st->u.curly
4916
4917 #define CURLY_SETPAREN(paren, success) \
4918     if (paren) { \
4919         if (success) { \
4920             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4921             PL_regoffs[paren].end = locinput - PL_bostr; \
4922             *PL_reglastcloseparen = paren; \
4923         } \
4924         else \
4925             PL_regoffs[paren].end = -1; \
4926     }
4927
4928         case STAR:              /*  /A*B/ where A is width 1 */
4929             ST.paren = 0;
4930             ST.min = 0;
4931             ST.max = REG_INFTY;
4932             scan = NEXTOPER(scan);
4933             goto repeat;
4934         case PLUS:              /*  /A+B/ where A is width 1 */
4935             ST.paren = 0;
4936             ST.min = 1;
4937             ST.max = REG_INFTY;
4938             scan = NEXTOPER(scan);
4939             goto repeat;
4940         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
4941             ST.paren = scan->flags;     /* Which paren to set */
4942             if (ST.paren > PL_regsize)
4943                 PL_regsize = ST.paren;
4944             if (ST.paren > *PL_reglastparen)
4945                 *PL_reglastparen = ST.paren;
4946             ST.min = ARG1(scan);  /* min to match */
4947             ST.max = ARG2(scan);  /* max to match */
4948             if (cur_eval && cur_eval->u.eval.close_paren &&
4949                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4950                 ST.min=1;
4951                 ST.max=1;
4952             }
4953             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4954             goto repeat;
4955         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
4956             ST.paren = 0;
4957             ST.min = ARG1(scan);  /* min to match */
4958             ST.max = ARG2(scan);  /* max to match */
4959             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4960           repeat:
4961             /*
4962             * Lookahead to avoid useless match attempts
4963             * when we know what character comes next.
4964             *
4965             * Used to only do .*x and .*?x, but now it allows
4966             * for )'s, ('s and (?{ ... })'s to be in the way
4967             * of the quantifier and the EXACT-like node.  -- japhy
4968             */
4969
4970             if (ST.min > ST.max) /* XXX make this a compile-time check? */
4971                 sayNO;
4972             if (HAS_TEXT(next) || JUMPABLE(next)) {
4973                 U8 *s;
4974                 regnode *text_node = next;
4975
4976                 if (! HAS_TEXT(text_node))
4977                     FIND_NEXT_IMPT(text_node);
4978
4979                 if (! HAS_TEXT(text_node))
4980                     ST.c1 = ST.c2 = CHRTEST_VOID;
4981                 else {
4982                     if ( PL_regkind[OP(text_node)] != EXACT ) {
4983                         ST.c1 = ST.c2 = CHRTEST_VOID;
4984                         goto assume_ok_easy;
4985                     }
4986                     else
4987                         s = (U8*)STRING(text_node);
4988
4989                     /*  Currently we only get here when
4990
4991                         PL_rekind[OP(text_node)] == EXACT
4992
4993                         if this changes back then the macro for IS_TEXT and
4994                         friends need to change. */
4995                     if (!UTF_PATTERN) {
4996                         ST.c2 = ST.c1 = *s;
4997                         if (IS_TEXTF(text_node))
4998                             ST.c2 = PL_fold[ST.c1];
4999                         else if (IS_TEXTFL(text_node))
5000                             ST.c2 = PL_fold_locale[ST.c1];
5001                     }
5002                     else { /* UTF_PATTERN */
5003                         if (IS_TEXTF(text_node)) {
5004                              STRLEN ulen1, ulen2;
5005                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5006                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5007
5008                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5009                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5010 #ifdef EBCDIC
5011                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5012                                                     ckWARN(WARN_UTF8) ?
5013                                                     0 : UTF8_ALLOW_ANY);
5014                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5015                                                     ckWARN(WARN_UTF8) ?
5016                                                     0 : UTF8_ALLOW_ANY);
5017 #else
5018                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5019                                                     uniflags);
5020                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5021                                                     uniflags);
5022 #endif
5023                         }
5024                         else {
5025                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5026                                                      uniflags);
5027                         }
5028                     }
5029                 }
5030             }
5031             else
5032                 ST.c1 = ST.c2 = CHRTEST_VOID;
5033         assume_ok_easy:
5034
5035             ST.A = scan;
5036             ST.B = next;
5037             PL_reginput = locinput;
5038             if (minmod) {
5039                 minmod = 0;
5040                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5041                     sayNO;
5042                 ST.count = ST.min;
5043                 locinput = PL_reginput;
5044                 REGCP_SET(ST.cp);
5045                 if (ST.c1 == CHRTEST_VOID)
5046                     goto curly_try_B_min;
5047
5048                 ST.oldloc = locinput;
5049
5050                 /* set ST.maxpos to the furthest point along the
5051                  * string that could possibly match */
5052                 if  (ST.max == REG_INFTY) {
5053                     ST.maxpos = PL_regeol - 1;
5054                     if (utf8_target)
5055                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5056                             ST.maxpos--;
5057                 }
5058                 else if (utf8_target) {
5059                     int m = ST.max - ST.min;
5060                     for (ST.maxpos = locinput;
5061                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5062                         ST.maxpos += UTF8SKIP(ST.maxpos);
5063                 }
5064                 else {
5065                     ST.maxpos = locinput + ST.max - ST.min;
5066                     if (ST.maxpos >= PL_regeol)
5067                         ST.maxpos = PL_regeol - 1;
5068                 }
5069                 goto curly_try_B_min_known;
5070
5071             }
5072             else {
5073                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5074                 locinput = PL_reginput;
5075                 if (ST.count < ST.min)
5076                     sayNO;
5077                 if ((ST.count > ST.min)
5078                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5079                 {
5080                     /* A{m,n} must come at the end of the string, there's
5081                      * no point in backing off ... */
5082                     ST.min = ST.count;
5083                     /* ...except that $ and \Z can match before *and* after
5084                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5085                        We may back off by one in this case. */
5086                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5087                         ST.min--;
5088                 }
5089                 REGCP_SET(ST.cp);
5090                 goto curly_try_B_max;
5091             }
5092             /* NOTREACHED */
5093
5094
5095         case CURLY_B_min_known_fail:
5096             /* failed to find B in a non-greedy match where c1,c2 valid */
5097             if (ST.paren && ST.count)
5098                 PL_regoffs[ST.paren].end = -1;
5099
5100             PL_reginput = locinput;     /* Could be reset... */
5101             REGCP_UNWIND(ST.cp);
5102             /* Couldn't or didn't -- move forward. */
5103             ST.oldloc = locinput;
5104             if (utf8_target)
5105                 locinput += UTF8SKIP(locinput);
5106             else
5107                 locinput++;
5108             ST.count++;
5109           curly_try_B_min_known:
5110              /* find the next place where 'B' could work, then call B */
5111             {
5112                 int n;
5113                 if (utf8_target) {
5114                     n = (ST.oldloc == locinput) ? 0 : 1;
5115                     if (ST.c1 == ST.c2) {
5116                         STRLEN len;
5117                         /* set n to utf8_distance(oldloc, locinput) */
5118                         while (locinput <= ST.maxpos &&
5119                                utf8n_to_uvchr((U8*)locinput,
5120                                               UTF8_MAXBYTES, &len,
5121                                               uniflags) != (UV)ST.c1) {
5122                             locinput += len;
5123                             n++;
5124                         }
5125                     }
5126                     else {
5127                         /* set n to utf8_distance(oldloc, locinput) */
5128                         while (locinput <= ST.maxpos) {
5129                             STRLEN len;
5130                             const UV c = utf8n_to_uvchr((U8*)locinput,
5131                                                   UTF8_MAXBYTES, &len,
5132                                                   uniflags);
5133                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5134                                 break;
5135                             locinput += len;
5136                             n++;
5137                         }
5138                     }
5139                 }
5140                 else {
5141                     if (ST.c1 == ST.c2) {
5142                         while (locinput <= ST.maxpos &&
5143                                UCHARAT(locinput) != ST.c1)
5144                             locinput++;
5145                     }
5146                     else {
5147                         while (locinput <= ST.maxpos
5148                                && UCHARAT(locinput) != ST.c1
5149                                && UCHARAT(locinput) != ST.c2)
5150                             locinput++;
5151                     }
5152                     n = locinput - ST.oldloc;
5153                 }
5154                 if (locinput > ST.maxpos)
5155                     sayNO;
5156                 /* PL_reginput == oldloc now */
5157                 if (n) {
5158                     ST.count += n;
5159                     if (regrepeat(rex, ST.A, n, depth) < n)
5160                         sayNO;
5161                 }
5162                 PL_reginput = locinput;
5163                 CURLY_SETPAREN(ST.paren, ST.count);
5164                 if (cur_eval && cur_eval->u.eval.close_paren &&
5165                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5166                     goto fake_end;
5167                 }
5168                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5169             }
5170             /* NOTREACHED */
5171
5172
5173         case CURLY_B_min_fail:
5174             /* failed to find B in a non-greedy match where c1,c2 invalid */
5175             if (ST.paren && ST.count)
5176                 PL_regoffs[ST.paren].end = -1;
5177
5178             REGCP_UNWIND(ST.cp);
5179             /* failed -- move forward one */
5180             PL_reginput = locinput;
5181             if (regrepeat(rex, ST.A, 1, depth)) {
5182                 ST.count++;
5183                 locinput = PL_reginput;
5184                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5185                         ST.count > 0)) /* count overflow ? */
5186                 {
5187                   curly_try_B_min:
5188                     CURLY_SETPAREN(ST.paren, ST.count);
5189                     if (cur_eval && cur_eval->u.eval.close_paren &&
5190                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5191                         goto fake_end;
5192                     }
5193                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5194                 }
5195             }
5196             sayNO;
5197             /* NOTREACHED */
5198
5199
5200         curly_try_B_max:
5201             /* a successful greedy match: now try to match B */
5202             if (cur_eval && cur_eval->u.eval.close_paren &&
5203                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5204                 goto fake_end;
5205             }
5206             {
5207                 UV c = 0;
5208                 if (ST.c1 != CHRTEST_VOID)
5209                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5210                                            UTF8_MAXBYTES, 0, uniflags)
5211                                 : (UV) UCHARAT(PL_reginput);
5212                 /* If it could work, try it. */
5213                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5214                     CURLY_SETPAREN(ST.paren, ST.count);
5215                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5216                     /* NOTREACHED */
5217                 }
5218             }
5219             /* FALL THROUGH */
5220         case CURLY_B_max_fail:
5221             /* failed to find B in a greedy match */
5222             if (ST.paren && ST.count)
5223                 PL_regoffs[ST.paren].end = -1;
5224
5225             REGCP_UNWIND(ST.cp);
5226             /*  back up. */
5227             if (--ST.count < ST.min)
5228                 sayNO;
5229             PL_reginput = locinput = HOPc(locinput, -1);
5230             goto curly_try_B_max;
5231
5232 #undef ST
5233
5234         case END:
5235             fake_end:
5236             if (cur_eval) {
5237                 /* we've just finished A in /(??{A})B/; now continue with B */
5238                 I32 tmpix;
5239                 st->u.eval.toggle_reg_flags
5240                             = cur_eval->u.eval.toggle_reg_flags;
5241                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5242
5243                 st->u.eval.prev_rex = rex_sv;           /* inner */
5244                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5245                 rex = (struct regexp *)SvANY(rex_sv);
5246                 rexi = RXi_GET(rex);
5247                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5248                 ReREFCNT_inc(rex_sv);
5249                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5250
5251                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5252                 PL_reglastparen = &rex->lastparen;
5253                 PL_reglastcloseparen = &rex->lastcloseparen;
5254
5255                 REGCP_SET(st->u.eval.lastcp);
5256                 PL_reginput = locinput;
5257
5258                 /* Restore parens of the outer rex without popping the
5259                  * savestack */
5260                 tmpix = PL_savestack_ix;
5261                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5262                 regcppop(rex);
5263                 PL_savestack_ix = tmpix;
5264
5265                 st->u.eval.prev_eval = cur_eval;
5266                 cur_eval = cur_eval->u.eval.prev_eval;
5267                 DEBUG_EXECUTE_r(
5268                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5269                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5270                 if ( nochange_depth )
5271                     nochange_depth--;
5272
5273                 PUSH_YES_STATE_GOTO(EVAL_AB,
5274                         st->u.eval.prev_eval->u.eval.B); /* match B */
5275             }
5276
5277             if (locinput < reginfo->till) {
5278                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5279                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5280                                       PL_colors[4],
5281                                       (long)(locinput - PL_reg_starttry),
5282                                       (long)(reginfo->till - PL_reg_starttry),
5283                                       PL_colors[5]));
5284
5285                 sayNO_SILENT;           /* Cannot match: too short. */
5286             }
5287             PL_reginput = locinput;     /* put where regtry can find it */
5288             sayYES;                     /* Success! */
5289
5290         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5291             DEBUG_EXECUTE_r(
5292             PerlIO_printf(Perl_debug_log,
5293                 "%*s  %ssubpattern success...%s\n",
5294                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5295             PL_reginput = locinput;     /* put where regtry can find it */
5296             sayYES;                     /* Success! */
5297
5298 #undef  ST
5299 #define ST st->u.ifmatch
5300
5301         case SUSPEND:   /* (?>A) */
5302             ST.wanted = 1;
5303             PL_reginput = locinput;
5304             goto do_ifmatch;
5305
5306         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5307             ST.wanted = 0;
5308             goto ifmatch_trivial_fail_test;
5309
5310         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5311             ST.wanted = 1;
5312           ifmatch_trivial_fail_test:
5313             if (scan->flags) {
5314                 char * const s = HOPBACKc(locinput, scan->flags);
5315                 if (!s) {
5316                     /* trivial fail */
5317                     if (logical) {
5318                         logical = 0;
5319                         sw = 1 - cBOOL(ST.wanted);
5320                     }
5321                     else if (ST.wanted)
5322                         sayNO;
5323                     next = scan + ARG(scan);
5324                     if (next == scan)
5325                         next = NULL;
5326                     break;
5327                 }
5328                 PL_reginput = s;
5329             }
5330             else
5331                 PL_reginput = locinput;
5332
5333           do_ifmatch:
5334             ST.me = scan;
5335             ST.logical = logical;
5336             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5337
5338             /* execute body of (?...A) */
5339             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5340             /* NOTREACHED */
5341
5342         case IFMATCH_A_fail: /* body of (?...A) failed */
5343             ST.wanted = !ST.wanted;
5344             /* FALL THROUGH */
5345
5346         case IFMATCH_A: /* body of (?...A) succeeded */
5347             if (ST.logical) {
5348                 sw = cBOOL(ST.wanted);
5349             }
5350             else if (!ST.wanted)
5351                 sayNO;
5352
5353             if (OP(ST.me) == SUSPEND)
5354                 locinput = PL_reginput;
5355             else {
5356                 locinput = PL_reginput = st->locinput;
5357                 nextchr = UCHARAT(locinput);
5358             }
5359             scan = ST.me + ARG(ST.me);
5360             if (scan == ST.me)
5361                 scan = NULL;
5362             continue; /* execute B */
5363
5364 #undef ST
5365
5366         case LONGJMP:
5367             next = scan + ARG(scan);
5368             if (next == scan)
5369                 next = NULL;
5370             break;
5371         case COMMIT:
5372             reginfo->cutpoint = PL_regeol;
5373             /* FALLTHROUGH */
5374         case PRUNE:
5375             PL_reginput = locinput;
5376             if (!scan->flags)
5377                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5378             PUSH_STATE_GOTO(COMMIT_next,next);
5379             /* NOTREACHED */
5380         case COMMIT_next_fail:
5381             no_final = 1;
5382             /* FALLTHROUGH */
5383         case OPFAIL:
5384             sayNO;
5385             /* NOTREACHED */
5386
5387 #define ST st->u.mark
5388         case MARKPOINT:
5389             ST.prev_mark = mark_state;
5390             ST.mark_name = sv_commit = sv_yes_mark
5391                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5392             mark_state = st;
5393             ST.mark_loc = PL_reginput = locinput;
5394             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5395             /* NOTREACHED */
5396         case MARKPOINT_next:
5397             mark_state = ST.prev_mark;
5398             sayYES;
5399             /* NOTREACHED */
5400         case MARKPOINT_next_fail:
5401             if (popmark && sv_eq(ST.mark_name,popmark))
5402             {
5403                 if (ST.mark_loc > startpoint)
5404                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5405                 popmark = NULL; /* we found our mark */
5406                 sv_commit = ST.mark_name;
5407
5408                 DEBUG_EXECUTE_r({
5409                         PerlIO_printf(Perl_debug_log,
5410                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5411                             REPORT_CODE_OFF+depth*2, "",
5412                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5413                 });
5414             }
5415             mark_state = ST.prev_mark;
5416             sv_yes_mark = mark_state ?
5417                 mark_state->u.mark.mark_name : NULL;
5418             sayNO;
5419             /* NOTREACHED */
5420         case SKIP:
5421             PL_reginput = locinput;
5422             if (scan->flags) {
5423                 /* (*SKIP) : if we fail we cut here*/
5424                 ST.mark_name = NULL;
5425                 ST.mark_loc = locinput;
5426                 PUSH_STATE_GOTO(SKIP_next,next);
5427             } else {
5428                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5429                    otherwise do nothing.  Meaning we need to scan
5430                  */
5431                 regmatch_state *cur = mark_state;
5432                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5433
5434                 while (cur) {
5435                     if ( sv_eq( cur->u.mark.mark_name,
5436                                 find ) )
5437                     {
5438                         ST.mark_name = find;
5439                         PUSH_STATE_GOTO( SKIP_next, next );
5440                     }
5441                     cur = cur->u.mark.prev_mark;
5442                 }
5443             }
5444             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5445             break;
5446         case SKIP_next_fail:
5447             if (ST.mark_name) {
5448                 /* (*CUT:NAME) - Set up to search for the name as we
5449                    collapse the stack*/
5450                 popmark = ST.mark_name;
5451             } else {
5452                 /* (*CUT) - No name, we cut here.*/
5453                 if (ST.mark_loc > startpoint)
5454                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5455                 /* but we set sv_commit to latest mark_name if there
5456                    is one so they can test to see how things lead to this
5457                    cut */
5458                 if (mark_state)
5459                     sv_commit=mark_state->u.mark.mark_name;
5460             }
5461             no_final = 1;
5462             sayNO;
5463             /* NOTREACHED */
5464 #undef ST
5465         case FOLDCHAR:
5466             n = ARG(scan);
5467             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5468                 locinput += ln;
5469             } else if ( 0xDF == n && !utf8_target && !UTF_PATTERN ) {
5470                 sayNO;
5471             } else  {
5472                 U8 folded[UTF8_MAXBYTES_CASE+1];
5473                 STRLEN foldlen;
5474                 const char * const l = locinput;
5475                 char *e = PL_regeol;
5476                 to_uni_fold(n, folded, &foldlen);
5477
5478                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5479                                l, &e, 0,  utf8_target)) {
5480                         sayNO;
5481                 }
5482                 locinput = e;
5483             }
5484             nextchr = UCHARAT(locinput);
5485             break;
5486         case LNBREAK:
5487             if ((n=is_LNBREAK(locinput,utf8_target))) {
5488                 locinput += n;
5489                 nextchr = UCHARAT(locinput);
5490             } else
5491                 sayNO;
5492             break;
5493
5494 #define CASE_CLASS(nAmE)                              \
5495         case nAmE:                                    \
5496             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5497                 locinput += n;                        \
5498                 nextchr = UCHARAT(locinput);          \
5499             } else                                    \
5500                 sayNO;                                \
5501             break;                                    \
5502         case N##nAmE:                                 \
5503             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5504                 sayNO;                                \
5505             } else {                                  \
5506                 locinput += UTF8SKIP(locinput);       \
5507                 nextchr = UCHARAT(locinput);          \
5508             }                                         \
5509             break
5510
5511         CASE_CLASS(VERTWS);
5512         CASE_CLASS(HORIZWS);
5513 #undef CASE_CLASS
5514
5515         default:
5516             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5517                           PTR2UV(scan), OP(scan));
5518             Perl_croak(aTHX_ "regexp memory corruption");
5519
5520         } /* end switch */
5521
5522         /* switch break jumps here */
5523         scan = next; /* prepare to execute the next op and ... */
5524         continue;    /* ... jump back to the top, reusing st */
5525         /* NOTREACHED */
5526
5527       push_yes_state:
5528         /* push a state that backtracks on success */
5529         st->u.yes.prev_yes_state = yes_state;
5530         yes_state = st;
5531         /* FALL THROUGH */
5532       push_state:
5533         /* push a new regex state, then continue at scan  */
5534         {
5535             regmatch_state *newst;
5536
5537             DEBUG_STACK_r({
5538                 regmatch_state *cur = st;
5539                 regmatch_state *curyes = yes_state;
5540                 int curd = depth;
5541                 regmatch_slab *slab = PL_regmatch_slab;
5542                 for (;curd > -1;cur--,curd--) {
5543                     if (cur < SLAB_FIRST(slab)) {
5544                         slab = slab->prev;
5545                         cur = SLAB_LAST(slab);
5546                     }
5547                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5548                         REPORT_CODE_OFF + 2 + depth * 2,"",
5549                         curd, PL_reg_name[cur->resume_state],
5550                         (curyes == cur) ? "yes" : ""
5551                     );
5552                     if (curyes == cur)
5553                         curyes = cur->u.yes.prev_yes_state;
5554                 }
5555             } else
5556                 DEBUG_STATE_pp("push")
5557             );
5558             depth++;
5559             st->locinput = locinput;
5560             newst = st+1;
5561             if (newst >  SLAB_LAST(PL_regmatch_slab))
5562                 newst = S_push_slab(aTHX);
5563             PL_regmatch_state = newst;
5564
5565             locinput = PL_reginput;
5566             nextchr = UCHARAT(locinput);
5567             st = newst;
5568             continue;
5569             /* NOTREACHED */
5570         }
5571     }
5572
5573     /*
5574     * We get here only if there's trouble -- normally "case END" is
5575     * the terminating point.
5576     */
5577     Perl_croak(aTHX_ "corrupted regexp pointers");
5578     /*NOTREACHED*/
5579     sayNO;
5580
5581 yes:
5582     if (yes_state) {
5583         /* we have successfully completed a subexpression, but we must now
5584          * pop to the state marked by yes_state and continue from there */
5585         assert(st != yes_state);
5586 #ifdef DEBUGGING
5587         while (st != yes_state) {
5588             st--;
5589             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5590                 PL_regmatch_slab = PL_regmatch_slab->prev;
5591                 st = SLAB_LAST(PL_regmatch_slab);
5592             }
5593             DEBUG_STATE_r({
5594                 if (no_final) {
5595                     DEBUG_STATE_pp("pop (no final)");
5596                 } else {
5597                     DEBUG_STATE_pp("pop (yes)");
5598                 }
5599             });
5600             depth--;
5601         }
5602 #else
5603         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5604             || yes_state > SLAB_LAST(PL_regmatch_slab))
5605         {
5606             /* not in this slab, pop slab */
5607             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5608             PL_regmatch_slab = PL_regmatch_slab->prev;
5609             st = SLAB_LAST(PL_regmatch_slab);
5610         }
5611         depth -= (st - yes_state);
5612 #endif
5613         st = yes_state;
5614         yes_state = st->u.yes.prev_yes_state;
5615         PL_regmatch_state = st;
5616
5617         if (no_final) {
5618             locinput= st->locinput;
5619             nextchr = UCHARAT(locinput);
5620         }
5621         state_num = st->resume_state + no_final;
5622         goto reenter_switch;
5623     }
5624
5625     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5626                           PL_colors[4], PL_colors[5]));
5627
5628     if (PL_reg_eval_set) {
5629         /* each successfully executed (?{...}) block does the equivalent of
5630          *   local $^R = do {...}
5631          * When popping the save stack, all these locals would be undone;
5632          * bypass this by setting the outermost saved $^R to the latest
5633          * value */
5634         if (oreplsv != GvSV(PL_replgv))
5635             sv_setsv(oreplsv, GvSV(PL_replgv));
5636     }
5637     result = 1;
5638     goto final_exit;
5639
5640 no:
5641     DEBUG_EXECUTE_r(
5642         PerlIO_printf(Perl_debug_log,
5643             "%*s  %sfailed...%s\n",
5644             REPORT_CODE_OFF+depth*2, "",
5645             PL_colors[4], PL_colors[5])
5646         );
5647
5648 no_silent:
5649     if (no_final) {
5650         if (yes_state) {
5651             goto yes;
5652         } else {
5653             goto final_exit;
5654         }
5655     }
5656     if (depth) {
5657         /* there's a previous state to backtrack to */
5658         st--;
5659         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5660             PL_regmatch_slab = PL_regmatch_slab->prev;
5661             st = SLAB_LAST(PL_regmatch_slab);
5662         }
5663         PL_regmatch_state = st;
5664         locinput= st->locinput;
5665         nextchr = UCHARAT(locinput);
5666
5667         DEBUG_STATE_pp("pop");
5668         depth--;
5669         if (yes_state == st)
5670             yes_state = st->u.yes.prev_yes_state;
5671
5672         state_num = st->resume_state + 1; /* failure = success + 1 */
5673         goto reenter_switch;
5674     }
5675     result = 0;
5676
5677   final_exit:
5678     if (rex->intflags & PREGf_VERBARG_SEEN) {
5679         SV *sv_err = get_sv("REGERROR", 1);
5680         SV *sv_mrk = get_sv("REGMARK", 1);
5681         if (result) {
5682             sv_commit = &PL_sv_no;
5683             if (!sv_yes_mark)
5684                 sv_yes_mark = &PL_sv_yes;
5685         } else {
5686             if (!sv_commit)
5687                 sv_commit = &PL_sv_yes;
5688             sv_yes_mark = &PL_sv_no;
5689         }
5690         sv_setsv(sv_err, sv_commit);
5691         sv_setsv(sv_mrk, sv_yes_mark);
5692     }
5693
5694     /* clean up; in particular, free all slabs above current one */
5695     LEAVE_SCOPE(oldsave);
5696
5697     return result;
5698 }
5699
5700 /*
5701  - regrepeat - repeatedly match something simple, report how many
5702  */
5703 /*
5704  * [This routine now assumes that it will only match on things of length 1.
5705  * That was true before, but now we assume scan - reginput is the count,
5706  * rather than incrementing count on every character.  [Er, except utf8.]]
5707  */
5708 STATIC I32
5709 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5710 {
5711     dVAR;
5712     register char *scan;
5713     register I32 c;
5714     register char *loceol = PL_regeol;
5715     register I32 hardcount = 0;
5716     register bool utf8_target = PL_reg_match_utf8;
5717 #ifndef DEBUGGING
5718     PERL_UNUSED_ARG(depth);
5719 #endif
5720
5721     PERL_ARGS_ASSERT_REGREPEAT;
5722
5723     scan = PL_reginput;
5724     if (max == REG_INFTY)
5725         max = I32_MAX;
5726     else if (max < loceol - scan)
5727         loceol = scan + max;
5728     switch (OP(p)) {
5729     case REG_ANY:
5730         if (utf8_target) {
5731             loceol = PL_regeol;
5732             while (scan < loceol && hardcount < max && *scan != '\n') {
5733                 scan += UTF8SKIP(scan);
5734                 hardcount++;
5735             }
5736         } else {
5737             while (scan < loceol && *scan != '\n')
5738                 scan++;
5739         }
5740         break;
5741     case SANY:
5742         if (utf8_target) {
5743             loceol = PL_regeol;
5744             while (scan < loceol && hardcount < max) {
5745                 scan += UTF8SKIP(scan);
5746                 hardcount++;
5747             }
5748         }
5749         else
5750             scan = loceol;
5751         break;
5752     case CANY:
5753         scan = loceol;
5754         break;
5755     case EXACT:         /* length of string is 1 */
5756         c = (U8)*STRING(p);
5757         while (scan < loceol && UCHARAT(scan) == c)
5758             scan++;
5759         break;
5760     case EXACTF:        /* length of string is 1 */
5761         c = (U8)*STRING(p);
5762         while (scan < loceol &&
5763                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5764             scan++;
5765         break;
5766     case EXACTFL:       /* length of string is 1 */
5767         PL_reg_flags |= RF_tainted;
5768         c = (U8)*STRING(p);
5769         while (scan < loceol &&
5770                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5771             scan++;
5772         break;
5773     case ANYOF:
5774         if (utf8_target) {
5775             loceol = PL_regeol;
5776             while (hardcount < max && scan < loceol &&
5777                    reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
5778                 scan += UTF8SKIP(scan);
5779                 hardcount++;
5780             }
5781         } else {
5782             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5783                 scan++;
5784         }
5785         break;
5786     case ALNUM:
5787         if (utf8_target) {
5788             loceol = PL_regeol;
5789             LOAD_UTF8_CHARCLASS_ALNUM();
5790             while (hardcount < max && scan < loceol &&
5791                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5792             {
5793                 scan += UTF8SKIP(scan);
5794                 hardcount++;
5795             }
5796         } else if (FLAGS(p) & USE_UNI) {
5797             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
5798                 scan++;
5799             }
5800         } else {
5801             while (scan < loceol && isALNUM((U8) *scan)) {
5802                 scan++;
5803             }
5804         }
5805         break;
5806     case ALNUML:
5807         PL_reg_flags |= RF_tainted;
5808         if (utf8_target) {
5809             loceol = PL_regeol;
5810             while (hardcount < max && scan < loceol &&
5811                    isALNUM_LC_utf8((U8*)scan)) {
5812                 scan += UTF8SKIP(scan);
5813                 hardcount++;
5814             }
5815         } else {
5816             while (scan < loceol && isALNUM_LC(*scan))
5817                 scan++;
5818         }
5819         break;
5820     case NALNUM:
5821         if (utf8_target) {
5822             loceol = PL_regeol;
5823             LOAD_UTF8_CHARCLASS_ALNUM();
5824             while (hardcount < max && scan < loceol &&
5825                    !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5826             {
5827                 scan += UTF8SKIP(scan);
5828                 hardcount++;
5829             }
5830         } else if (FLAGS(p) & USE_UNI) {
5831             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
5832                 scan++;
5833             }
5834         } else {
5835             while (scan < loceol && ! isALNUM((U8) *scan)) {
5836                 scan++;
5837             }
5838         }
5839         break;
5840     case NALNUML:
5841         PL_reg_flags |= RF_tainted;
5842         if (utf8_target) {
5843             loceol = PL_regeol;
5844             while (hardcount < max && scan < loceol &&
5845                    !isALNUM_LC_utf8((U8*)scan)) {
5846                 scan += UTF8SKIP(scan);
5847                 hardcount++;
5848             }
5849         } else {
5850             while (scan < loceol && !isALNUM_LC(*scan))
5851                 scan++;
5852         }
5853         break;
5854     case SPACE:
5855         if (utf8_target) {
5856             loceol = PL_regeol;
5857             LOAD_UTF8_CHARCLASS_SPACE();
5858             while (hardcount < max && scan < loceol &&
5859                    (*scan == ' ' ||
5860                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5861             {
5862                 scan += UTF8SKIP(scan);
5863                 hardcount++;
5864             }
5865         } else if (FLAGS(p) & USE_UNI) {
5866             while (scan < loceol && isSPACE_L1((U8) *scan)) {
5867                 scan++;
5868             }
5869         } else {
5870             while (scan < loceol && isSPACE((U8) *scan))
5871                 scan++;
5872         }
5873         break;
5874     case SPACEL:
5875         PL_reg_flags |= RF_tainted;
5876         if (utf8_target) {
5877             loceol = PL_regeol;
5878             while (hardcount < max && scan < loceol &&
5879                    (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5880                 scan += UTF8SKIP(scan);
5881                 hardcount++;
5882             }
5883         } else {
5884             while (scan < loceol && isSPACE_LC(*scan))
5885                 scan++;
5886         }
5887         break;
5888     case NSPACE:
5889         if (utf8_target) {
5890             loceol = PL_regeol;
5891             LOAD_UTF8_CHARCLASS_SPACE();
5892             while (hardcount < max && scan < loceol &&
5893                    !(*scan == ' ' ||
5894                      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5895             {
5896                 scan += UTF8SKIP(scan);
5897                 hardcount++;
5898             }
5899         } else if (FLAGS(p) & USE_UNI) {
5900             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
5901                 scan++;
5902             }
5903         } else {
5904             while (scan < loceol && ! isSPACE((U8) *scan)) {
5905                 scan++;
5906             }
5907         }
5908         break;
5909     case NSPACEL:
5910         PL_reg_flags |= RF_tainted;
5911         if (utf8_target) {
5912             loceol = PL_regeol;
5913             while (hardcount < max && scan < loceol &&
5914                    !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5915                 scan += UTF8SKIP(scan);
5916                 hardcount++;
5917             }
5918         } else {
5919             while (scan < loceol && !isSPACE_LC(*scan))
5920                 scan++;
5921         }
5922         break;
5923     case DIGIT:
5924         if (utf8_target) {
5925             loceol = PL_regeol;
5926             LOAD_UTF8_CHARCLASS_DIGIT();
5927             while (hardcount < max && scan < loceol &&
5928                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
5929                 scan += UTF8SKIP(scan);
5930                 hardcount++;
5931             }
5932         } else {
5933             while (scan < loceol && isDIGIT(*scan))
5934                 scan++;
5935         }
5936         break;
5937     case NDIGIT:
5938         if (utf8_target) {
5939             loceol = PL_regeol;
5940             LOAD_UTF8_CHARCLASS_DIGIT();
5941             while (hardcount < max && scan < loceol &&
5942                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
5943                 scan += UTF8SKIP(scan);
5944                 hardcount++;
5945             }
5946         } else {
5947             while (scan < loceol && !isDIGIT(*scan))
5948                 scan++;
5949         }
5950     case LNBREAK:
5951         if (utf8_target) {
5952             loceol = PL_regeol;
5953             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5954                 scan += c;
5955                 hardcount++;
5956             }
5957         } else {
5958             /*
5959               LNBREAK can match two latin chars, which is ok,
5960               because we have a null terminated string, but we
5961               have to use hardcount in this situation
5962             */
5963             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
5964                 scan+=c;
5965                 hardcount++;
5966             }
5967         }
5968         break;
5969     case HORIZWS:
5970         if (utf8_target) {
5971             loceol = PL_regeol;
5972             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5973                 scan += c;
5974                 hardcount++;
5975             }
5976         } else {
5977             while (scan < loceol && is_HORIZWS_latin1(scan))
5978                 scan++;
5979         }
5980         break;
5981     case NHORIZWS:
5982         if (utf8_target) {
5983             loceol = PL_regeol;
5984             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5985                 scan += UTF8SKIP(scan);
5986                 hardcount++;
5987             }
5988         } else {
5989             while (scan < loceol && !is_HORIZWS_latin1(scan))
5990                 scan++;
5991
5992         }
5993         break;
5994     case VERTWS:
5995         if (utf8_target) {
5996             loceol = PL_regeol;
5997             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5998                 scan += c;
5999                 hardcount++;
6000             }
6001         } else {
6002             while (scan < loceol && is_VERTWS_latin1(scan))
6003                 scan++;
6004
6005         }
6006         break;
6007     case NVERTWS:
6008         if (utf8_target) {
6009             loceol = PL_regeol;
6010             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6011                 scan += UTF8SKIP(scan);
6012                 hardcount++;
6013             }
6014         } else {
6015             while (scan < loceol && !is_VERTWS_latin1(scan))
6016                 scan++;
6017
6018         }
6019         break;
6020
6021     default:            /* Called on something of 0 width. */
6022         break;          /* So match right here or not at all. */
6023     }
6024
6025     if (hardcount)
6026         c = hardcount;
6027     else
6028         c = scan - PL_reginput;
6029     PL_reginput = scan;
6030
6031     DEBUG_r({
6032         GET_RE_DEBUG_FLAGS_DECL;
6033         DEBUG_EXECUTE_r({
6034             SV * const prop = sv_newmortal();
6035             regprop(prog, prop, p);
6036             PerlIO_printf(Perl_debug_log,
6037                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6038                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6039         });
6040     });
6041
6042     return(c);
6043 }
6044
6045
6046 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6047 /*
6048 - regclass_swash - prepare the utf8 swash
6049 */
6050
6051 SV *
6052 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6053 {
6054     dVAR;
6055     SV *sw  = NULL;
6056     SV *si  = NULL;
6057     SV *alt = NULL;
6058     RXi_GET_DECL(prog,progi);
6059     const struct reg_data * const data = prog ? progi->data : NULL;
6060
6061     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6062
6063     if (data && data->count) {
6064         const U32 n = ARG(node);
6065
6066         if (data->what[n] == 's') {
6067             SV * const rv = MUTABLE_SV(data->data[n]);
6068             AV * const av = MUTABLE_AV(SvRV(rv));
6069             SV **const ary = AvARRAY(av);
6070             SV **a, **b;
6071
6072             /* See the end of regcomp.c:S_regclass() for
6073              * documentation of these array elements. */
6074
6075             si = *ary;
6076             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6077             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6078
6079             if (a)
6080                 sw = *a;
6081             else if (si && doinit) {
6082                 sw = swash_init("utf8", "", si, 1, 0);
6083                 (void)av_store(av, 1, sw);
6084             }
6085             if (b)
6086                 alt = *b;
6087         }
6088     }
6089
6090     if (listsvp)
6091         *listsvp = si;
6092     if (altsvp)
6093         *altsvp  = alt;
6094
6095     return sw;
6096 }
6097 #endif
6098
6099 /*
6100  - reginclass - determine if a character falls into a character class
6101
6102   The n is the ANYOF regnode, the p is the target string, lenp
6103   is pointer to the maximum length of how far to go in the p
6104   (if the lenp is zero, UTF8SKIP(p) is used),
6105   utf8_target tells whether the target string is in UTF-8.
6106
6107  */
6108
6109 STATIC bool
6110 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool utf8_target)
6111 {
6112     dVAR;
6113     const char flags = ANYOF_FLAGS(n);
6114     bool match = FALSE;
6115     UV c = *p;
6116     STRLEN len = 0;
6117     STRLEN plen;
6118
6119     PERL_ARGS_ASSERT_REGINCLASS;
6120
6121     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6122         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
6123                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6124                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6125                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6126                  * UTF8_ALLOW_FFFF */
6127         if (len == (STRLEN)-1)
6128             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6129     }
6130
6131     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
6132     if (utf8_target || (flags & ANYOF_UNICODE)) {
6133         if (lenp)
6134             *lenp = 0;
6135         if (utf8_target && !ANYOF_RUNTIME(n)) {
6136             if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
6137                 match = TRUE;
6138         }
6139         if (!match && utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256)
6140             match = TRUE;
6141         if (!match) {
6142             AV *av;
6143             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6144
6145             if (sw) {
6146                 U8 * utf8_p;
6147                 if (utf8_target) {
6148                     utf8_p = (U8 *) p;
6149                 } else {
6150                     STRLEN len = 1;
6151                     utf8_p = bytes_to_utf8(p, &len);
6152                 }
6153                 if (swash_fetch(sw, utf8_p, 1))
6154                     match = TRUE;
6155                 else if (flags & ANYOF_FOLD) {
6156                     if (!match && lenp && av) {
6157                         I32 i;
6158                         for (i = 0; i <= av_len(av); i++) {
6159                             SV* const sv = *av_fetch(av, i, FALSE);
6160                             STRLEN len;
6161                             const char * const s = SvPV_const(sv, len);
6162                             if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
6163                                 *lenp = len;
6164                                 match = TRUE;
6165                                 break;
6166                             }
6167                         }
6168                     }
6169                     if (!match) {
6170                         U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
6171
6172                         STRLEN tmplen;
6173                         to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6174                         if (swash_fetch(sw, tmpbuf, 1))
6175                             match = TRUE;
6176                     }
6177                 }
6178
6179                 /* If we allocated a string above, free it */
6180                 if (! utf8_target) Safefree(utf8_p);
6181             }
6182         }
6183         if (match && lenp && *lenp == 0)
6184             *lenp = UNISKIP(NATIVE_TO_UNI(c));
6185     }
6186     if (!match && c < 256) {
6187         if (ANYOF_BITMAP_TEST(n, c))
6188             match = TRUE;
6189         else if (flags & ANYOF_FOLD) {
6190             U8 f;
6191
6192             if (flags & ANYOF_LOCALE) {
6193                 PL_reg_flags |= RF_tainted;
6194                 f = PL_fold_locale[c];
6195             }
6196             else
6197                 f = PL_fold[c];
6198             if (f != c && ANYOF_BITMAP_TEST(n, f))
6199                 match = TRUE;
6200         }
6201
6202         if (!match && (flags & ANYOF_CLASS)) {
6203             PL_reg_flags |= RF_tainted;
6204             if (
6205                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6206                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6207                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6208                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6209                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6210                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6211                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6212                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6213                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6214                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6215                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6216                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6217                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6218                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6219                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6220                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6221                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6222                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6223                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6224                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6225                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6226                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6227                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6228                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6229                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6230                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6231                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6232                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6233                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6234                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6235                 ) /* How's that for a conditional? */
6236             {
6237                 match = TRUE;
6238             }
6239         }
6240     }
6241
6242     return (flags & ANYOF_INVERT) ? !match : match;
6243 }
6244
6245 STATIC U8 *
6246 S_reghop3(U8 *s, I32 off, const U8* lim)
6247 {
6248     dVAR;
6249
6250     PERL_ARGS_ASSERT_REGHOP3;
6251
6252     if (off >= 0) {
6253         while (off-- && s < lim) {
6254             /* XXX could check well-formedness here */
6255             s += UTF8SKIP(s);
6256         }
6257     }
6258     else {
6259         while (off++ && s > lim) {
6260             s--;
6261             if (UTF8_IS_CONTINUED(*s)) {
6262                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6263                     s--;
6264             }
6265             /* XXX could check well-formedness here */
6266         }
6267     }
6268     return s;
6269 }
6270
6271 #ifdef XXX_dmq
6272 /* there are a bunch of places where we use two reghop3's that should
6273    be replaced with this routine. but since thats not done yet
6274    we ifdef it out - dmq
6275 */
6276 STATIC U8 *
6277 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6278 {
6279     dVAR;
6280
6281     PERL_ARGS_ASSERT_REGHOP4;
6282
6283     if (off >= 0) {
6284         while (off-- && s < rlim) {
6285             /* XXX could check well-formedness here */
6286             s += UTF8SKIP(s);
6287         }
6288     }
6289     else {
6290         while (off++ && s > llim) {
6291             s--;
6292             if (UTF8_IS_CONTINUED(*s)) {
6293                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6294                     s--;
6295             }
6296             /* XXX could check well-formedness here */
6297         }
6298     }
6299     return s;
6300 }
6301 #endif
6302
6303 STATIC U8 *
6304 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6305 {
6306     dVAR;
6307
6308     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6309
6310     if (off >= 0) {
6311         while (off-- && s < lim) {
6312             /* XXX could check well-formedness here */
6313             s += UTF8SKIP(s);
6314         }
6315         if (off >= 0)
6316             return NULL;
6317     }
6318     else {
6319         while (off++ && s > lim) {
6320             s--;
6321             if (UTF8_IS_CONTINUED(*s)) {
6322                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6323                     s--;
6324             }
6325             /* XXX could check well-formedness here */
6326         }
6327         if (off <= 0)
6328             return NULL;
6329     }
6330     return s;
6331 }
6332
6333 static void
6334 restore_pos(pTHX_ void *arg)
6335 {
6336     dVAR;
6337     regexp * const rex = (regexp *)arg;
6338     if (PL_reg_eval_set) {
6339         if (PL_reg_oldsaved) {
6340             rex->subbeg = PL_reg_oldsaved;
6341             rex->sublen = PL_reg_oldsavedlen;
6342 #ifdef PERL_OLD_COPY_ON_WRITE
6343             rex->saved_copy = PL_nrs;
6344 #endif
6345             RXp_MATCH_COPIED_on(rex);
6346         }
6347         PL_reg_magic->mg_len = PL_reg_oldpos;
6348         PL_reg_eval_set = 0;
6349         PL_curpm = PL_reg_oldcurpm;
6350     }
6351 }
6352
6353 STATIC void
6354 S_to_utf8_substr(pTHX_ register regexp *prog)
6355 {
6356     int i = 1;
6357
6358     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6359
6360     do {
6361         if (prog->substrs->data[i].substr
6362             && !prog->substrs->data[i].utf8_substr) {
6363             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6364             prog->substrs->data[i].utf8_substr = sv;
6365             sv_utf8_upgrade(sv);
6366             if (SvVALID(prog->substrs->data[i].substr)) {
6367                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6368                 if (flags & FBMcf_TAIL) {
6369                     /* Trim the trailing \n that fbm_compile added last
6370                        time.  */
6371                     SvCUR_set(sv, SvCUR(sv) - 1);
6372                     /* Whilst this makes the SV technically "invalid" (as its
6373                        buffer is no longer followed by "\0") when fbm_compile()
6374                        adds the "\n" back, a "\0" is restored.  */
6375                 }
6376                 fbm_compile(sv, flags);
6377             }
6378             if (prog->substrs->data[i].substr == prog->check_substr)
6379                 prog->check_utf8 = sv;
6380         }
6381     } while (i--);
6382 }
6383
6384 STATIC void
6385 S_to_byte_substr(pTHX_ register regexp *prog)
6386 {
6387     dVAR;
6388     int i = 1;
6389
6390     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6391
6392     do {
6393         if (prog->substrs->data[i].utf8_substr
6394             && !prog->substrs->data[i].substr) {
6395             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6396             if (sv_utf8_downgrade(sv, TRUE)) {
6397                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6398                     const U8 flags
6399                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6400                     if (flags & FBMcf_TAIL) {
6401                         /* Trim the trailing \n that fbm_compile added last
6402                            time.  */
6403                         SvCUR_set(sv, SvCUR(sv) - 1);
6404                     }
6405                     fbm_compile(sv, flags);
6406                 }
6407             } else {
6408                 SvREFCNT_dec(sv);
6409                 sv = &PL_sv_undef;
6410             }
6411             prog->substrs->data[i].substr = sv;
6412             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6413                 prog->check_substr = sv;
6414         }
6415     } while (i--);
6416 }
6417
6418 /*
6419  * Local variables:
6420  * c-indentation-style: bsd
6421  * c-basic-offset: 4
6422  * indent-tabs-mode: t
6423  * End:
6424  *
6425  * ex: set ts=8 sts=4 sw=4 noet:
6426  */