src/5013006/orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1               /* tainted information used? */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
  98
  99 /*
 100  * Forwards.
 101  */
 102
 103 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 105
 106 #define HOPc(pos,off) \
 107         (char *)(PL_reg_match_utf8 \
 108             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 109             : (U8*)(pos + off))
 110 #define HOPBACKc(pos, off) \
 111         (char*)(PL_reg_match_utf8\
 112             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 113             : (pos - off >= PL_bostr)           \
 114                 ? (U8*)pos - off                \
 115                 : NULL)
 116
 117 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 118 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 119
 120 /* these are unrolled below in the CCC_TRY_XXX defined */
 121 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 122     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 123
 124 /* Doesn't do an assert to verify that is correct */
 125 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 127
 128 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 129 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 130 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 131
 132 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 133         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 134         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 135         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 136             * assert should likely and hopefully fail on an EBCDIC machine */ \
 137         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 138                                                                             \
 139         /* No asserts are done for these, in case called on an early        \
 140             * Unicode version in which they map to nothing */               \
 141         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 142         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 143         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 144         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 148
 149 /*
 150    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 151    so that it is possible to override the option here without having to
 152    rebuild the entire core. as we are required to do if we change regcomp.h
 153    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 154 */
 155 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 156 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 157 #endif
 158
 159 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 160 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 161 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 162 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 163 #define RE_utf8_perl_word   PL_utf8_alnum
 164 #define RE_utf8_perl_space  PL_utf8_space
 165 #define RE_utf8_posix_digit PL_utf8_digit
 166 #define perl_word  alnum
 167 #define perl_space space
 168 #define posix_digit digit
 169 #else
 170 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 171 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 172 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 173 #define RE_utf8_perl_word   PL_utf8_perl_word
 174 #define RE_utf8_perl_space  PL_utf8_perl_space
 175 #define RE_utf8_posix_digit PL_utf8_posix_digit
 176 #endif
 177
 178
 179 #define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
 180         case NAMEL:                                                         \
 181             PL_reg_flags |= RF_tainted;                                     \
 182             /* FALL THROUGH */                                              \
 183         case NAME:                                                          \
 184             if (!nextchr)                                                   \
 185                 sayNO;                                                      \
 186             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
 187                 if (!CAT2(PL_utf8_,CLASS)) {                                \
 188                     bool ok;                                                \
 189                     ENTER;                                                  \
 190                     save_re_context();                                      \
 191                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
 192                     assert(ok);                                             \
 193                     LEAVE;                                                  \
 194                 }                                                           \
 195                 if (!(OP(scan) == NAME                                      \
 196                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 197                     : LCFUNC_utf8((U8*)locinput)))                          \
 198                 {                                                           \
 199                     sayNO;                                                  \
 200                 }                                                           \
 201                 locinput += PL_utf8skip[nextchr];                           \
 202                 nextchr = UCHARAT(locinput);                                \
 203                 break;                                                      \
 204             }                                                               \
 205             /* Drops through to the macro that calls this one */
 206
 207 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
 208     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
 209             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
 210                 sayNO;                                                      \
 211             nextchr = UCHARAT(++locinput);                                  \
 212             break
 213
 214 /* Almost identical to the above, but has a case for a node that matches chars
 215  * between 128 and 255 using Unicode (latin1) semantics. */
 216 #define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 217     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 218             if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 219                 sayNO;                                                       \
 220             nextchr = UCHARAT(++locinput);                                   \
 221             break
 222
 223 #define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
 224         case NAMEL:                                                          \
 225             PL_reg_flags |= RF_tainted;                                      \
 226             /* FALL THROUGH */                                               \
 227         case NAME :                                                          \
 228             if (!nextchr && locinput >= PL_regeol)                           \
 229                 sayNO;                                                       \
 230             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
 231                 if (!CAT2(PL_utf8_,CLASS)) {                                 \
 232                     bool ok;                                                 \
 233                     ENTER;                                                   \
 234                     save_re_context();                                       \
 235                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
 236                     assert(ok);                                              \
 237                     LEAVE;                                                   \
 238                 }                                                            \
 239                 if ((OP(scan) == NAME                                        \
 240                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 241                     : LCFUNC_utf8((U8*)locinput)))                           \
 242                 {                                                            \
 243                     sayNO;                                                   \
 244                 }                                                            \
 245                 locinput += PL_utf8skip[nextchr];                            \
 246                 nextchr = UCHARAT(locinput);                                 \
 247                 break;                                                       \
 248             }
 249
 250 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
 251     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 252             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
 253                 sayNO;                                                       \
 254             nextchr = UCHARAT(++locinput);                                   \
 255             break
 256
 257
 258 #define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 259     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
 260             if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 261                 sayNO;                                                       \
 262             nextchr = UCHARAT(++locinput);                                   \
 263             break
 264
 265
 266
 267 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 268
 269 /* for use after a quantifier and before an EXACT-like node -- japhy */
 270 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 271  *
 272  * NOTE that *nothing* that affects backtracking should be in here, specifically
 273  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 274  * node that is in between two EXACT like nodes when ascertaining what the required
 275  * "follow" character is. This should probably be moved to regex compile time
 276  * although it may be done at run time beause of the REF possibility - more
 277  * investigation required. -- demerphq
 278 */
 279 #define JUMPABLE(rn) (      \
 280     OP(rn) == OPEN ||       \
 281     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 282     OP(rn) == EVAL ||   \
 283     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 284     OP(rn) == PLUS || OP(rn) == MINMOD || \
 285     OP(rn) == KEEPS || \
 286     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 287 )
 288 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 289
 290 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 291
 292 #if 0
 293 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 294    we don't need this definition. */
 295 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 296 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 297 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 298
 299 #else
 300 /* ... so we use this as its faster. */
 301 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 302 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 303 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 304
 305 #endif
 306
 307 /*
 308   Search for mandatory following text node; for lookahead, the text must
 309   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 310 */
 311 #define FIND_NEXT_IMPT(rn) STMT_START { \
 312     while (JUMPABLE(rn)) { \
 313         const OPCODE type = OP(rn); \
 314         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 315             rn = NEXTOPER(NEXTOPER(rn)); \
 316         else if (type == PLUS) \
 317             rn = NEXTOPER(rn); \
 318         else if (type == IFMATCH) \
 319             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 320         else rn += NEXT_OFF(rn); \
 321     } \
 322 } STMT_END
 323
 324
 325 static void restore_pos(pTHX_ void *arg);
 326
 327 #define REGCP_PAREN_ELEMS 4
 328 #define REGCP_OTHER_ELEMS 5
 329 #define REGCP_FRAME_ELEMS 1
 330 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 331  * are needed for the regexp context stack bookkeeping. */
 332
 333 STATIC CHECKPOINT
 334 S_regcppush(pTHX_ I32 parenfloor)
 335 {
 336     dVAR;
 337     const int retval = PL_savestack_ix;
 338     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 339     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 340     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 341     int p;
 342     GET_RE_DEBUG_FLAGS_DECL;
 343
 344     if (paren_elems_to_push < 0)
 345         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 346
 347     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 348         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 349                    " out of range (%lu-%ld)",
 350                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 351
 352     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 353
 354     for (p = PL_regsize; p > parenfloor; p--) {
 355 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 356         SSPUSHINT(PL_regoffs[p].end);
 357         SSPUSHINT(PL_regoffs[p].start);
 358         SSPUSHPTR(PL_reg_start_tmp[p]);
 359         SSPUSHINT(p);
 360         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 361           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 362                       (UV)p, (IV)PL_regoffs[p].start,
 363                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 364                       (IV)PL_regoffs[p].end
 365         ));
 366     }
 367 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 368     SSPUSHPTR(PL_regoffs);
 369     SSPUSHINT(PL_regsize);
 370     SSPUSHINT(*PL_reglastparen);
 371     SSPUSHINT(*PL_reglastcloseparen);
 372     SSPUSHPTR(PL_reginput);
 373     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 374
 375     return retval;
 376 }
 377
 378 /* These are needed since we do not localize EVAL nodes: */
 379 #define REGCP_SET(cp)                                           \
 380     DEBUG_STATE_r(                                              \
 381             PerlIO_printf(Perl_debug_log,                       \
 382                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 383                 (IV)PL_savestack_ix));                          \
 384     cp = PL_savestack_ix
 385
 386 #define REGCP_UNWIND(cp)                                        \
 387     DEBUG_STATE_r(                                              \
 388         if (cp != PL_savestack_ix)                              \
 389             PerlIO_printf(Perl_debug_log,                       \
 390                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 391                 (IV)(cp), (IV)PL_savestack_ix));                \
 392     regcpblow(cp)
 393
 394 STATIC char *
 395 S_regcppop(pTHX_ const regexp *rex)
 396 {
 397     dVAR;
 398     UV i;
 399     char *input;
 400     GET_RE_DEBUG_FLAGS_DECL;
 401
 402     PERL_ARGS_ASSERT_REGCPPOP;
 403
 404     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 405     i = SSPOPUV;
 406     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 407     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 408     input = (char *) SSPOPPTR;
 409     *PL_reglastcloseparen = SSPOPINT;
 410     *PL_reglastparen = SSPOPINT;
 411     PL_regsize = SSPOPINT;
 412     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 413
 414     i -= REGCP_OTHER_ELEMS;
 415     /* Now restore the parentheses context. */
 416     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 417         I32 tmps;
 418         U32 paren = (U32)SSPOPINT;
 419         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 420         PL_regoffs[paren].start = SSPOPINT;
 421         tmps = SSPOPINT;
 422         if (paren <= *PL_reglastparen)
 423             PL_regoffs[paren].end = tmps;
 424         DEBUG_BUFFERS_r(
 425             PerlIO_printf(Perl_debug_log,
 426                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 427                           (UV)paren, (IV)PL_regoffs[paren].start,
 428                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 429                           (IV)PL_regoffs[paren].end,
 430                           (paren > *PL_reglastparen ? "(no)" : ""));
 431         );
 432     }
 433     DEBUG_BUFFERS_r(
 434         if (*PL_reglastparen + 1 <= rex->nparens) {
 435             PerlIO_printf(Perl_debug_log,
 436                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 437                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 438         }
 439     );
 440 #if 1
 441     /* It would seem that the similar code in regtry()
 442      * already takes care of this, and in fact it is in
 443      * a better location to since this code can #if 0-ed out
 444      * but the code in regtry() is needed or otherwise tests
 445      * requiring null fields (pat.t#187 and split.t#{13,14}
 446      * (as of patchlevel 7877)  will fail.  Then again,
 447      * this code seems to be necessary or otherwise
 448      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 449      * --jhi updated by dapm */
 450     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 451         if (i > PL_regsize)
 452             PL_regoffs[i].start = -1;
 453         PL_regoffs[i].end = -1;
 454     }
 455 #endif
 456     return input;
 457 }
 458
 459 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 460
 461 /*
 462  * pregexec and friends
 463  */
 464
 465 #ifndef PERL_IN_XSUB_RE
 466 /*
 467  - pregexec - match a regexp against a string
 468  */
 469 I32
 470 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 471          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 472 /* strend: pointer to null at end of string */
 473 /* strbeg: real beginning of string */
 474 /* minend: end of match must be >=minend after stringarg. */
 475 /* nosave: For optimizations. */
 476 {
 477     PERL_ARGS_ASSERT_PREGEXEC;
 478
 479     return
 480         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 481                       nosave ? 0 : REXEC_COPY_STR);
 482 }
 483 #endif
 484
 485 /*
 486  * Need to implement the following flags for reg_anch:
 487  *
 488  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 489  * USE_INTUIT_ML
 490  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 491  * INTUIT_AUTORITATIVE_ML
 492  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 493  * INTUIT_ONCE_ML
 494  *
 495  * Another flag for this function: SECOND_TIME (so that float substrs
 496  * with giant delta may be not rechecked).
 497  */
 498
 499 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 500
 501 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 502    Otherwise, only SvCUR(sv) is used to get strbeg. */
 503
 504 /* XXXX We assume that strpos is strbeg unless sv. */
 505
 506 /* XXXX Some places assume that there is a fixed substring.
 507         An update may be needed if optimizer marks as "INTUITable"
 508         RExen without fixed substrings.  Similarly, it is assumed that
 509         lengths of all the strings are no more than minlen, thus they
 510         cannot come from lookahead.
 511         (Or minlen should take into account lookahead.)
 512   NOTE: Some of this comment is not correct. minlen does now take account
 513   of lookahead/behind. Further research is required. -- demerphq
 514
 515 */
 516
 517 /* A failure to find a constant substring means that there is no need to make
 518    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 519    finding a substring too deep into the string means that less calls to
 520    regtry() should be needed.
 521
 522    REx compiler's optimizer found 4 possible hints:
 523         a) Anchored substring;
 524         b) Fixed substring;
 525         c) Whether we are anchored (beginning-of-line or \G);
 526         d) First node (of those at offset 0) which may distingush positions;
 527    We use a)b)d) and multiline-part of c), and try to find a position in the
 528    string which does not contradict any of them.
 529  */
 530
 531 /* Most of decisions we do here should have been done at compile time.
 532    The nodes of the REx which we used for the search should have been
 533    deleted from the finite automaton. */
 534
 535 char *
 536 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 537                      char *strend, const U32 flags, re_scream_pos_data *data)
 538 {
 539     dVAR;
 540     struct regexp *const prog = (struct regexp *)SvANY(rx);
 541     register I32 start_shift = 0;
 542     /* Should be nonnegative! */
 543     register I32 end_shift   = 0;
 544     register char *s;
 545     register SV *check;
 546     char *strbeg;
 547     char *t;
 548     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 549     I32 ml_anch;
 550     register char *other_last = NULL;   /* other substr checked before this */
 551     char *check_at = NULL;              /* check substr found at this pos */
 552     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 553     RXi_GET_DECL(prog,progi);
 554 #ifdef DEBUGGING
 555     const char * const i_strpos = strpos;
 556 #endif
 557     GET_RE_DEBUG_FLAGS_DECL;
 558
 559     PERL_ARGS_ASSERT_RE_INTUIT_START;
 560
 561     RX_MATCH_UTF8_set(rx,utf8_target);
 562
 563     if (RX_UTF8(rx)) {
 564         PL_reg_flags |= RF_utf8;
 565     }
 566     DEBUG_EXECUTE_r(
 567         debug_start_match(rx, utf8_target, strpos, strend,
 568             sv ? "Guessing start of match in sv for"
 569                : "Guessing start of match in string for");
 570               );
 571
 572     /* CHR_DIST() would be more correct here but it makes things slow. */
 573     if (prog->minlen > strend - strpos) {
 574         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 575                               "String too short... [re_intuit_start]\n"));
 576         goto fail;
 577     }
 578
 579     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 580     PL_regeol = strend;
 581     if (utf8_target) {
 582         if (!prog->check_utf8 && prog->check_substr)
 583             to_utf8_substr(prog);
 584         check = prog->check_utf8;
 585     } else {
 586         if (!prog->check_substr && prog->check_utf8)
 587             to_byte_substr(prog);
 588         check = prog->check_substr;
 589     }
 590     if (check == &PL_sv_undef) {
 591         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 592                 "Non-utf8 string cannot match utf8 check string\n"));
 593         goto fail;
 594     }
 595     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 596         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 597                      || ( (prog->extflags & RXf_ANCH_BOL)
 598                           && !multiline ) );    /* Check after \n? */
 599
 600         if (!ml_anch) {
 601           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 602                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 603                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 604                && sv && !SvROK(sv)
 605                && (strpos != strbeg)) {
 606               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 607               goto fail;
 608           }
 609           if (prog->check_offset_min == prog->check_offset_max &&
 610               !(prog->extflags & RXf_CANY_SEEN)) {
 611             /* Substring at constant offset from beg-of-str... */
 612             I32 slen;
 613
 614             s = HOP3c(strpos, prog->check_offset_min, strend);
 615
 616             if (SvTAIL(check)) {
 617                 slen = SvCUR(check);    /* >= 1 */
 618
 619                 if ( strend - s > slen || strend - s < slen - 1
 620                      || (strend - s == slen && strend[-1] != '\n')) {
 621                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 622                     goto fail_finish;
 623                 }
 624                 /* Now should match s[0..slen-2] */
 625                 slen--;
 626                 if (slen && (*SvPVX_const(check) != *s
 627                              || (slen > 1
 628                                  && memNE(SvPVX_const(check), s, slen)))) {
 629                   report_neq:
 630                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 631                     goto fail_finish;
 632                 }
 633             }
 634             else if (*SvPVX_const(check) != *s
 635                      || ((slen = SvCUR(check)) > 1
 636                          && memNE(SvPVX_const(check), s, slen)))
 637                 goto report_neq;
 638             check_at = s;
 639             goto success_at_start;
 640           }
 641         }
 642         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 643         s = strpos;
 644         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 645         end_shift = prog->check_end_shift;
 646
 647         if (!ml_anch) {
 648             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 649                                          - (SvTAIL(check) != 0);
 650             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 651
 652             if (end_shift < eshift)
 653                 end_shift = eshift;
 654         }
 655     }
 656     else {                              /* Can match at random position */
 657         ml_anch = 0;
 658         s = strpos;
 659         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 660         end_shift = prog->check_end_shift;
 661
 662         /* end shift should be non negative here */
 663     }
 664
 665 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 666     if (end_shift < 0)
 667         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 668                    (IV)end_shift, RX_PRECOMP(prog));
 669 #endif
 670
 671   restart:
 672     /* Find a possible match in the region s..strend by looking for
 673        the "check" substring in the region corrected by start/end_shift. */
 674
 675     {
 676         I32 srch_start_shift = start_shift;
 677         I32 srch_end_shift = end_shift;
 678         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 679             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 680             srch_start_shift = strbeg - s;
 681         }
 682     DEBUG_OPTIMISE_MORE_r({
 683         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 684             (IV)prog->check_offset_min,
 685             (IV)srch_start_shift,
 686             (IV)srch_end_shift,
 687             (IV)prog->check_end_shift);
 688     });
 689
 690     if (flags & REXEC_SCREAM) {
 691         I32 p = -1;                     /* Internal iterator of scream. */
 692         I32 * const pp = data ? data->scream_pos : &p;
 693
 694         if (PL_screamfirst[BmRARE(check)] >= 0
 695             || ( BmRARE(check) == '\n'
 696                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 697                  && SvTAIL(check) ))
 698             s = screaminstr(sv, check,
 699                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 700         else
 701             goto fail_finish;
 702         /* we may be pointing at the wrong string */
 703         if (s && RXp_MATCH_COPIED(prog))
 704             s = strbeg + (s - SvPVX_const(sv));
 705         if (data)
 706             *data->scream_olds = s;
 707     }
 708     else {
 709         U8* start_point;
 710         U8* end_point;
 711         if (prog->extflags & RXf_CANY_SEEN) {
 712             start_point= (U8*)(s + srch_start_shift);
 713             end_point= (U8*)(strend - srch_end_shift);
 714         } else {
 715             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 716             end_point= HOP3(strend, -srch_end_shift, strbeg);
 717         }
 718         DEBUG_OPTIMISE_MORE_r({
 719             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 720                 (int)(end_point - start_point),
 721                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 722                 start_point);
 723         });
 724
 725         s = fbm_instr( start_point, end_point,
 726                       check, multiline ? FBMrf_MULTILINE : 0);
 727     }
 728     }
 729     /* Update the count-of-usability, remove useless subpatterns,
 730         unshift s.  */
 731
 732     DEBUG_EXECUTE_r({
 733         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 734             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 735         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 736                           (s ? "Found" : "Did not find"),
 737             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 738                 ? "anchored" : "floating"),
 739             quoted,
 740             RE_SV_TAIL(check),
 741             (s ? " at offset " : "...\n") );
 742     });
 743
 744     if (!s)
 745         goto fail_finish;
 746     /* Finish the diagnostic message */
 747     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 748
 749     /* XXX dmq: first branch is for positive lookbehind...
 750        Our check string is offset from the beginning of the pattern.
 751        So we need to do any stclass tests offset forward from that
 752        point. I think. :-(
 753      */
 754
 755
 756
 757     check_at=s;
 758
 759
 760     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 761        Start with the other substr.
 762        XXXX no SCREAM optimization yet - and a very coarse implementation
 763        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 764                 *always* match.  Probably should be marked during compile...
 765        Probably it is right to do no SCREAM here...
 766      */
 767
 768     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 769                 : (prog->float_substr && prog->anchored_substr))
 770     {
 771         /* Take into account the "other" substring. */
 772         /* XXXX May be hopelessly wrong for UTF... */
 773         if (!other_last)
 774             other_last = strpos;
 775         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 776           do_other_anchored:
 777             {
 778                 char * const last = HOP3c(s, -start_shift, strbeg);
 779                 char *last1, *last2;
 780                 char * const saved_s = s;
 781                 SV* must;
 782
 783                 t = s - prog->check_offset_max;
 784                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 785                     && (!utf8_target
 786                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 787                             && t > strpos)))
 788                     NOOP;
 789                 else
 790                     t = strpos;
 791                 t = HOP3c(t, prog->anchored_offset, strend);
 792                 if (t < other_last)     /* These positions already checked */
 793                     t = other_last;
 794                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 795                 if (last < last1)
 796                     last1 = last;
 797                 /* XXXX It is not documented what units *_offsets are in.
 798                    We assume bytes, but this is clearly wrong.
 799                    Meaning this code needs to be carefully reviewed for errors.
 800                    dmq.
 801                   */
 802
 803                 /* On end-of-str: see comment below. */
 804                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 805                 if (must == &PL_sv_undef) {
 806                     s = (char*)NULL;
 807                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 808                 }
 809                 else
 810                     s = fbm_instr(
 811                         (unsigned char*)t,
 812                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 813                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 814                         must,
 815                         multiline ? FBMrf_MULTILINE : 0
 816                     );
 817                 DEBUG_EXECUTE_r({
 818                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 819                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 820                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 821                         (s ? "Found" : "Contradicts"),
 822                         quoted, RE_SV_TAIL(must));
 823                 });
 824
 825
 826                 if (!s) {
 827                     if (last1 >= last2) {
 828                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 829                                                 ", giving up...\n"));
 830                         goto fail_finish;
 831                     }
 832                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 833                         ", trying floating at offset %ld...\n",
 834                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 835                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 836                     s = HOP3c(last, 1, strend);
 837                     goto restart;
 838                 }
 839                 else {
 840                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 841                           (long)(s - i_strpos)));
 842                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 843                     other_last = HOP3c(s, 1, strend);
 844                     s = saved_s;
 845                     if (t == strpos)
 846                         goto try_at_start;
 847                     goto try_at_offset;
 848                 }
 849             }
 850         }
 851         else {          /* Take into account the floating substring. */
 852             char *last, *last1;
 853             char * const saved_s = s;
 854             SV* must;
 855
 856             t = HOP3c(s, -start_shift, strbeg);
 857             last1 = last =
 858                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 859             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 860                 last = HOP3c(t, prog->float_max_offset, strend);
 861             s = HOP3c(t, prog->float_min_offset, strend);
 862             if (s < other_last)
 863                 s = other_last;
 864  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 865             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 866             /* fbm_instr() takes into account exact value of end-of-str
 867                if the check is SvTAIL(ed).  Since false positives are OK,
 868                and end-of-str is not later than strend we are OK. */
 869             if (must == &PL_sv_undef) {
 870                 s = (char*)NULL;
 871                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 872             }
 873             else
 874                 s = fbm_instr((unsigned char*)s,
 875                               (unsigned char*)last + SvCUR(must)
 876                                   - (SvTAIL(must)!=0),
 877                               must, multiline ? FBMrf_MULTILINE : 0);
 878             DEBUG_EXECUTE_r({
 879                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 880                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 881                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 882                     (s ? "Found" : "Contradicts"),
 883                     quoted, RE_SV_TAIL(must));
 884             });
 885             if (!s) {
 886                 if (last1 == last) {
 887                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 888                                             ", giving up...\n"));
 889                     goto fail_finish;
 890                 }
 891                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 892                     ", trying anchored starting at offset %ld...\n",
 893                     (long)(saved_s + 1 - i_strpos)));
 894                 other_last = last;
 895                 s = HOP3c(t, 1, strend);
 896                 goto restart;
 897             }
 898             else {
 899                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 900                       (long)(s - i_strpos)));
 901                 other_last = s; /* Fix this later. --Hugo */
 902                 s = saved_s;
 903                 if (t == strpos)
 904                     goto try_at_start;
 905                 goto try_at_offset;
 906             }
 907         }
 908     }
 909
 910
 911     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 912
 913     DEBUG_OPTIMISE_MORE_r(
 914         PerlIO_printf(Perl_debug_log,
 915             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 916             (IV)prog->check_offset_min,
 917             (IV)prog->check_offset_max,
 918             (IV)(s-strpos),
 919             (IV)(t-strpos),
 920             (IV)(t-s),
 921             (IV)(strend-strpos)
 922         )
 923     );
 924
 925     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 926         && (!utf8_target
 927             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 928                  && t > strpos)))
 929     {
 930         /* Fixed substring is found far enough so that the match
 931            cannot start at strpos. */
 932       try_at_offset:
 933         if (ml_anch && t[-1] != '\n') {
 934             /* Eventually fbm_*() should handle this, but often
 935                anchored_offset is not 0, so this check will not be wasted. */
 936             /* XXXX In the code below we prefer to look for "^" even in
 937                presence of anchored substrings.  And we search even
 938                beyond the found float position.  These pessimizations
 939                are historical artefacts only.  */
 940           find_anchor:
 941             while (t < strend - prog->minlen) {
 942                 if (*t == '\n') {
 943                     if (t < check_at - prog->check_offset_min) {
 944                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 945                             /* Since we moved from the found position,
 946                                we definitely contradict the found anchored
 947                                substr.  Due to the above check we do not
 948                                contradict "check" substr.
 949                                Thus we can arrive here only if check substr
 950                                is float.  Redo checking for "other"=="fixed".
 951                              */
 952                             strpos = t + 1;
 953                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 954                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 955                             goto do_other_anchored;
 956                         }
 957                         /* We don't contradict the found floating substring. */
 958                         /* XXXX Why not check for STCLASS? */
 959                         s = t + 1;
 960                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 961                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 962                         goto set_useful;
 963                     }
 964                     /* Position contradicts check-string */
 965                     /* XXXX probably better to look for check-string
 966                        than for "\n", so one should lower the limit for t? */
 967                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 968                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 969                     other_last = strpos = s = t + 1;
 970                     goto restart;
 971                 }
 972                 t++;
 973             }
 974             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 975                         PL_colors[0], PL_colors[1]));
 976             goto fail_finish;
 977         }
 978         else {
 979             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 980                         PL_colors[0], PL_colors[1]));
 981         }
 982         s = t;
 983       set_useful:
 984         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 985     }
 986     else {
 987         /* The found string does not prohibit matching at strpos,
 988            - no optimization of calling REx engine can be performed,
 989            unless it was an MBOL and we are not after MBOL,
 990            or a future STCLASS check will fail this. */
 991       try_at_start:
 992         /* Even in this situation we may use MBOL flag if strpos is offset
 993            wrt the start of the string. */
 994         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 995             && (strpos != strbeg) && strpos[-1] != '\n'
 996             /* May be due to an implicit anchor of m{.*foo}  */
 997             && !(prog->intflags & PREGf_IMPLICIT))
 998         {
 999             t = strpos;
1000             goto find_anchor;
1001         }
1002         DEBUG_EXECUTE_r( if (ml_anch)
1003             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1004                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1005         );
1006       success_at_start:
1007         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1008             && (utf8_target ? (
1009                 prog->check_utf8                /* Could be deleted already */
1010                 && --BmUSEFUL(prog->check_utf8) < 0
1011                 && (prog->check_utf8 == prog->float_utf8)
1012             ) : (
1013                 prog->check_substr              /* Could be deleted already */
1014                 && --BmUSEFUL(prog->check_substr) < 0
1015                 && (prog->check_substr == prog->float_substr)
1016             )))
1017         {
1018             /* If flags & SOMETHING - do not do it many times on the same match */
1019             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1020             /* XXX Does the destruction order has to change with utf8_target? */
1021             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1022             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1023             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1024             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1025             check = NULL;                       /* abort */
1026             s = strpos;
1027             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag
1028                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1029             if (prog->intflags & PREGf_IMPLICIT)
1030                 prog->extflags &= ~RXf_ANCH_MBOL;
1031             /* XXXX This is a remnant of the old implementation.  It
1032                     looks wasteful, since now INTUIT can use many
1033                     other heuristics. */
1034             prog->extflags &= ~RXf_USE_INTUIT;
1035             /* XXXX What other flags might need to be cleared in this branch? */
1036         }
1037         else
1038             s = strpos;
1039     }
1040
1041     /* Last resort... */
1042     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1043     /* trie stclasses are too expensive to use here, we are better off to
1044        leave it to regmatch itself */
1045     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1046         /* minlen == 0 is possible if regstclass is \b or \B,
1047            and the fixed substr is ''$.
1048            Since minlen is already taken into account, s+1 is before strend;
1049            accidentally, minlen >= 1 guaranties no false positives at s + 1
1050            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1051            regstclass does not come from lookahead...  */
1052         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1053            This leaves EXACTF only, which is dealt with in find_byclass().  */
1054         const U8* const str = (U8*)STRING(progi->regstclass);
1055         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1056                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1057                     : 1);
1058         char * endpos;
1059         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1060             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1061         else if (prog->float_substr || prog->float_utf8)
1062             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1063         else
1064             endpos= strend;
1065
1066         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1067                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1068
1069         t = s;
1070         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1071         if (!s) {
1072 #ifdef DEBUGGING
1073             const char *what = NULL;
1074 #endif
1075             if (endpos == strend) {
1076                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1077                                 "Could not match STCLASS...\n") );
1078                 goto fail;
1079             }
1080             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1081                                    "This position contradicts STCLASS...\n") );
1082             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1083                 goto fail;
1084             /* Contradict one of substrings */
1085             if (prog->anchored_substr || prog->anchored_utf8) {
1086                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1087                     DEBUG_EXECUTE_r( what = "anchored" );
1088                   hop_and_restart:
1089                     s = HOP3c(t, 1, strend);
1090                     if (s + start_shift + end_shift > strend) {
1091                         /* XXXX Should be taken into account earlier? */
1092                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1093                                                "Could not match STCLASS...\n") );
1094                         goto fail;
1095                     }
1096                     if (!check)
1097                         goto giveup;
1098                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1099                                 "Looking for %s substr starting at offset %ld...\n",
1100                                  what, (long)(s + start_shift - i_strpos)) );
1101                     goto restart;
1102                 }
1103                 /* Have both, check_string is floating */
1104                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1105                     goto retry_floating_check;
1106                 /* Recheck anchored substring, but not floating... */
1107                 s = check_at;
1108                 if (!check)
1109                     goto giveup;
1110                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1111                           "Looking for anchored substr starting at offset %ld...\n",
1112                           (long)(other_last - i_strpos)) );
1113                 goto do_other_anchored;
1114             }
1115             /* Another way we could have checked stclass at the
1116                current position only: */
1117             if (ml_anch) {
1118                 s = t = t + 1;
1119                 if (!check)
1120                     goto giveup;
1121                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1122                           "Looking for /%s^%s/m starting at offset %ld...\n",
1123                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1124                 goto try_at_offset;
1125             }
1126             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1127                 goto fail;
1128             /* Check is floating subtring. */
1129           retry_floating_check:
1130             t = check_at - start_shift;
1131             DEBUG_EXECUTE_r( what = "floating" );
1132             goto hop_and_restart;
1133         }
1134         if (t != s) {
1135             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1136                         "By STCLASS: moving %ld --> %ld\n",
1137                                   (long)(t - i_strpos), (long)(s - i_strpos))
1138                    );
1139         }
1140         else {
1141             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1142                                   "Does not contradict STCLASS...\n");
1143                    );
1144         }
1145     }
1146   giveup:
1147     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1148                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1149                           PL_colors[5], (long)(s - i_strpos)) );
1150     return s;
1151
1152   fail_finish:                          /* Substring not found */
1153     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1154         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1155   fail:
1156     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1157                           PL_colors[4], PL_colors[5]));
1158     return NULL;
1159 }
1160
1161 #define DECL_TRIE_TYPE(scan) \
1162     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1163                     trie_type = (scan->flags != EXACT) \
1164                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1165                               : (utf8_target ? trie_utf8 : trie_plain)
1166
1167 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1168 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1169     switch (trie_type) {                                                    \
1170     case trie_utf8_fold:                                                    \
1171         if ( foldlen>0 ) {                                                  \
1172             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1173             foldlen -= len;                                                 \
1174             uscan += len;                                                   \
1175             len=0;                                                          \
1176         } else {                                                            \
1177             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1178             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1179             foldlen -= UNISKIP( uvc );                                      \
1180             uscan = foldbuf + UNISKIP( uvc );                               \
1181         }                                                                   \
1182         break;                                                              \
1183     case trie_latin_utf8_fold:                                              \
1184         if ( foldlen>0 ) {                                                  \
1185             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1186             foldlen -= len;                                                 \
1187             uscan += len;                                                   \
1188             len=0;                                                          \
1189         } else {                                                            \
1190             len = 1;                                                        \
1191             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1192             foldlen -= UNISKIP( uvc );                                      \
1193             uscan = foldbuf + UNISKIP( uvc );                               \
1194         }                                                                   \
1195         break;                                                              \
1196     case trie_utf8:                                                         \
1197         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1198         break;                                                              \
1199     case trie_plain:                                                        \
1200         uvc = (UV)*uc;                                                      \
1201         len = 1;                                                            \
1202     }                                                                       \
1203     if (uvc < 256) {                                                        \
1204         charid = trie->charmap[ uvc ];                                      \
1205     }                                                                       \
1206     else {                                                                  \
1207         charid = 0;                                                         \
1208         if (widecharmap) {                                                  \
1209             SV** const svpp = hv_fetch(widecharmap,                         \
1210                         (char*)&uvc, sizeof(UV), 0);                        \
1211             if (svpp)                                                       \
1212                 charid = (U16)SvIV(*svpp);                                  \
1213         }                                                                   \
1214     }                                                                       \
1215 } STMT_END
1216
1217 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1218 {                                                      \
1219     char *my_strend= (char *)strend;                   \
1220     if ( (CoNd)                                        \
1221          && (ln == len ||                              \
1222              foldEQ_utf8(s, &my_strend, 0,  utf8_target,   \
1223                         m, NULL, ln, cBOOL(UTF_PATTERN)))      \
1224          && (!reginfo || regtry(reginfo, &s)) )        \
1225         goto got_it;                                   \
1226     else {                                             \
1227          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1228          uvchr_to_utf8(tmpbuf, c);                     \
1229          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1230          if ( f != c                                   \
1231               && (f == c1 || f == c2)                  \
1232               && (ln == len ||                         \
1233                 foldEQ_utf8(s, &my_strend, 0,  utf8_target,\
1234                               m, NULL, ln, cBOOL(UTF_PATTERN)))\
1235               && (!reginfo || regtry(reginfo, &s)) )   \
1236               goto got_it;                             \
1237     }                                                  \
1238 }                                                      \
1239 s += len
1240
1241 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1242 STMT_START {                                              \
1243     while (s <= e) {                                      \
1244         if ( (CoNd)                                       \
1245              && (ln == 1 || (OP(c) == EXACTF             \
1246                               ? foldEQ(s, m, ln)           \
1247                               : foldEQ_locale(s, m, ln)))  \
1248              && (!reginfo || regtry(reginfo, &s)) )        \
1249             goto got_it;                                  \
1250         s++;                                              \
1251     }                                                     \
1252 } STMT_END
1253
1254 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1255 STMT_START {                                          \
1256     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1257         CoDe                                          \
1258         s += uskip;                                   \
1259     }                                                 \
1260 } STMT_END
1261
1262 #define REXEC_FBC_SCAN(CoDe)                          \
1263 STMT_START {                                          \
1264     while (s < strend) {                              \
1265         CoDe                                          \
1266         s++;                                          \
1267     }                                                 \
1268 } STMT_END
1269
1270 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1271 REXEC_FBC_UTF8_SCAN(                                  \
1272     if (CoNd) {                                       \
1273         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1274             goto got_it;                              \
1275         else                                          \
1276             tmp = doevery;                            \
1277     }                                                 \
1278     else                                              \
1279         tmp = 1;                                      \
1280 )
1281
1282 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1283 REXEC_FBC_SCAN(                                       \
1284     if (CoNd) {                                       \
1285         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1286             goto got_it;                              \
1287         else                                          \
1288             tmp = doevery;                            \
1289     }                                                 \
1290     else                                              \
1291         tmp = 1;                                      \
1292 )
1293
1294 #define REXEC_FBC_TRYIT               \
1295 if ((!reginfo || regtry(reginfo, &s))) \
1296     goto got_it
1297
1298 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1299     if (utf8_target) {                                             \
1300         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1301     }                                                          \
1302     else {                                                     \
1303         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1304     }                                                          \
1305     break
1306
1307 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1308     if (utf8_target) {                                             \
1309         UtFpReLoAd;                                            \
1310         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1311     }                                                          \
1312     else {                                                     \
1313         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1314     }                                                          \
1315     break
1316
1317 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1318     PL_reg_flags |= RF_tainted;                                \
1319     if (utf8_target) {                                             \
1320         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1321     }                                                          \
1322     else {                                                     \
1323         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1324     }                                                          \
1325     break
1326
1327 #define DUMP_EXEC_POS(li,s,doutf8) \
1328     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1329
1330 /* We know what class REx starts with.  Try to find this position... */
1331 /* if reginfo is NULL, its a dryrun */
1332 /* annoyingly all the vars in this routine have different names from their counterparts
1333    in regmatch. /grrr */
1334
1335 STATIC char *
1336 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1337     const char *strend, regmatch_info *reginfo)
1338 {
1339         dVAR;
1340         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1341         char *m;
1342         STRLEN ln;
1343         STRLEN lnc;
1344         register STRLEN uskip;
1345         unsigned int c1;
1346         unsigned int c2;
1347         char *e;
1348         register I32 tmp = 1;   /* Scratch variable? */
1349         register const bool utf8_target = PL_reg_match_utf8;
1350         RXi_GET_DECL(prog,progi);
1351
1352         PERL_ARGS_ASSERT_FIND_BYCLASS;
1353
1354         /* We know what class it must start with. */
1355         switch (OP(c)) {
1356         case ANYOF:
1357             if (utf8_target) {
1358                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1359                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1360                           reginclass(prog, c, (U8*)s, 0, utf8_target) :
1361                           REGINCLASS(prog, c, (U8*)s));
1362             }
1363             else {
1364                  while (s < strend) {
1365                       STRLEN skip = 1;
1366
1367                       if (REGINCLASS(prog, c, (U8*)s) ||
1368                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1369                            /* The assignment of 2 is intentional:
1370                             * for the folded sharp s, the skip is 2. */
1371                            (skip = SHARP_S_SKIP))) {
1372                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1373                                 goto got_it;
1374                            else
1375                                 tmp = doevery;
1376                       }
1377                       else
1378                            tmp = 1;
1379                       s += skip;
1380                  }
1381             }
1382             break;
1383         case CANY:
1384             REXEC_FBC_SCAN(
1385                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1386                     goto got_it;
1387                 else
1388                     tmp = doevery;
1389             );
1390             break;
1391         case EXACTF:
1392             m   = STRING(c);
1393             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1394             lnc = (I32) ln;     /* length to match in characters */
1395             if (UTF_PATTERN) {
1396                 STRLEN ulen1, ulen2;
1397                 U8 *sm = (U8 *) m;
1398                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1399                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1400                 /* used by commented-out code below */
1401                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1402
1403                 /* XXX: Since the node will be case folded at compile
1404                    time this logic is a little odd, although im not
1405                    sure that its actually wrong. --dmq */
1406
1407                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1408                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1409
1410                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1411                    codepoint of the first character in the converted
1412                    form, yet originally we did the extra step.
1413                    No tests fail by commenting this code out however
1414                    so Ive left it out. -- dmq.
1415
1416                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1417                                     0, uniflags);
1418                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1419                                     0, uniflags);
1420                 */
1421
1422                 lnc = 0;
1423                 while (sm < ((U8 *) m + ln)) {
1424                     lnc++;
1425                     sm += UTF8SKIP(sm);
1426                 }
1427             }
1428             else {
1429                 c1 = *(U8*)m;
1430                 c2 = PL_fold[c1];
1431             }
1432             goto do_exactf;
1433         case EXACTFL:
1434             m   = STRING(c);
1435             ln  = STR_LEN(c);
1436             lnc = (I32) ln;
1437             c1 = *(U8*)m;
1438             c2 = PL_fold_locale[c1];
1439           do_exactf:
1440             e = HOP3c(strend, -((I32)lnc), s);
1441
1442             if (!reginfo && e < s)
1443                 e = s;                  /* Due to minlen logic of intuit() */
1444
1445             /* The idea in the EXACTF* cases is to first find the
1446              * first character of the EXACTF* node and then, if
1447              * necessary, case-insensitively compare the full
1448              * text of the node.  The c1 and c2 are the first
1449              * characters (though in Unicode it gets a bit
1450              * more complicated because there are more cases
1451              * than just upper and lower: one needs to use
1452              * the so-called folding case for case-insensitive
1453              * matching (called "loose matching" in Unicode).
1454              * foldEQ_utf8() will do just that. */
1455
1456             if (utf8_target || UTF_PATTERN) {
1457                 UV c, f;
1458                 U8 tmpbuf [UTF8_MAXBYTES+1];
1459                 STRLEN len = 1;
1460                 STRLEN foldlen;
1461                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1462                 if (c1 == c2) {
1463                     /* Upper and lower of 1st char are equal -
1464                      * probably not a "letter". */
1465                     while (s <= e) {
1466                         if (utf8_target) {
1467                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1468                                            uniflags);
1469                         } else {
1470                             c = *((U8*)s);
1471                         }
1472                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1473                     }
1474                 }
1475                 else {
1476                     while (s <= e) {
1477                         if (utf8_target) {
1478                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1479                                            uniflags);
1480                         } else {
1481                             c = *((U8*)s);
1482                         }
1483
1484                         /* Handle some of the three Greek sigmas cases.
1485                          * Note that not all the possible combinations
1486                          * are handled here: some of them are handled
1487                          * by the standard folding rules, and some of
1488                          * them (the character class or ANYOF cases)
1489                          * are handled during compiletime in
1490                          * regexec.c:S_regclass(). */
1491                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1492                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1493                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1494
1495                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1496                     }
1497                 }
1498             }
1499             else {
1500                 /* Neither pattern nor string are UTF8 */
1501                 if (c1 == c2)
1502                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1503                 else
1504                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1505             }
1506             break;
1507         case BOUNDL:
1508             PL_reg_flags |= RF_tainted;
1509             /* FALL THROUGH */
1510         case BOUND:
1511             if (utf8_target) {
1512                 if (s == PL_bostr)
1513                     tmp = '\n';
1514                 else {
1515                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1516                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1517                 }
1518                 tmp = ((OP(c) == BOUND ?
1519                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1520                 LOAD_UTF8_CHARCLASS_ALNUM();
1521                 REXEC_FBC_UTF8_SCAN(
1522                     if (tmp == !(OP(c) == BOUND ?
1523                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1524                                  isALNUM_LC_utf8((U8*)s)))
1525                     {
1526                         tmp = !tmp;
1527                         REXEC_FBC_TRYIT;
1528                 }
1529                 );
1530             }
1531             else {  /* Not utf8 */
1532                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1533                 tmp = cBOOL((OP(c) == BOUNDL)
1534                             ? isALNUM_LC(tmp)
1535                             : (isWORDCHAR_L1(tmp)
1536                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1537                 REXEC_FBC_SCAN(
1538                     if (tmp ==
1539                         !((OP(c) == BOUNDL)
1540                           ? isALNUM_LC(*s)
1541                           : (isWORDCHAR_L1((U8) *s)
1542                              && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1543                     {
1544                         tmp = !tmp;
1545                         REXEC_FBC_TRYIT;
1546                 }
1547                 );
1548             }
1549             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1550                 goto got_it;
1551             break;
1552         case NBOUNDL:
1553             PL_reg_flags |= RF_tainted;
1554             /* FALL THROUGH */
1555         case NBOUND:
1556             if (utf8_target) {
1557                 if (s == PL_bostr)
1558                     tmp = '\n';
1559                 else {
1560                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1561                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1562                 }
1563                 tmp = ((OP(c) == NBOUND ?
1564                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1565                 LOAD_UTF8_CHARCLASS_ALNUM();
1566                 REXEC_FBC_UTF8_SCAN(
1567                     if (tmp == !(OP(c) == NBOUND ?
1568                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1569                                  isALNUM_LC_utf8((U8*)s)))
1570                         tmp = !tmp;
1571                     else REXEC_FBC_TRYIT;
1572                 );
1573             }
1574             else {
1575                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1576                 tmp = cBOOL((OP(c) == NBOUNDL)
1577                             ? isALNUM_LC(tmp)
1578                             : (isWORDCHAR_L1(tmp)
1579                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1580                 REXEC_FBC_SCAN(
1581                     if (tmp == ! cBOOL(
1582                             (OP(c) == NBOUNDL)
1583                             ? isALNUM_LC(*s)
1584                             : (isWORDCHAR_L1((U8) *s)
1585                                && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1586                     {
1587                         tmp = !tmp;
1588                     }
1589                     else REXEC_FBC_TRYIT;
1590                 );
1591             }
1592             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1593                 goto got_it;
1594             break;
1595         case ALNUM:
1596             REXEC_FBC_CSCAN_PRELOAD(
1597                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1598                 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1599                 (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
1600             );
1601         case ALNUML:
1602             REXEC_FBC_CSCAN_TAINT(
1603                 isALNUM_LC_utf8((U8*)s),
1604                 isALNUM_LC(*s)
1605             );
1606         case NALNUM:
1607             REXEC_FBC_CSCAN_PRELOAD(
1608                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1609                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1610                 ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
1611             );
1612         case NALNUML:
1613             REXEC_FBC_CSCAN_TAINT(
1614                 !isALNUM_LC_utf8((U8*)s),
1615                 !isALNUM_LC(*s)
1616             );
1617         case SPACE:
1618             REXEC_FBC_CSCAN_PRELOAD(
1619                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1620                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1621                 isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
1622             );
1623         case SPACEL:
1624             REXEC_FBC_CSCAN_TAINT(
1625                 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1626                 isSPACE_LC(*s)
1627             );
1628         case NSPACE:
1629             REXEC_FBC_CSCAN_PRELOAD(
1630                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1631                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1632                 !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
1633             );
1634         case NSPACEL:
1635             REXEC_FBC_CSCAN_TAINT(
1636                 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1637                 !isSPACE_LC(*s)
1638             );
1639         case DIGIT:
1640             REXEC_FBC_CSCAN_PRELOAD(
1641                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1642                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1643                 isDIGIT(*s)
1644             );
1645         case DIGITL:
1646             REXEC_FBC_CSCAN_TAINT(
1647                 isDIGIT_LC_utf8((U8*)s),
1648                 isDIGIT_LC(*s)
1649             );
1650         case NDIGIT:
1651             REXEC_FBC_CSCAN_PRELOAD(
1652                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1653                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1654                 !isDIGIT(*s)
1655             );
1656         case NDIGITL:
1657             REXEC_FBC_CSCAN_TAINT(
1658                 !isDIGIT_LC_utf8((U8*)s),
1659                 !isDIGIT_LC(*s)
1660             );
1661         case LNBREAK:
1662             REXEC_FBC_CSCAN(
1663                 is_LNBREAK_utf8(s),
1664                 is_LNBREAK_latin1(s)
1665             );
1666         case VERTWS:
1667             REXEC_FBC_CSCAN(
1668                 is_VERTWS_utf8(s),
1669                 is_VERTWS_latin1(s)
1670             );
1671         case NVERTWS:
1672             REXEC_FBC_CSCAN(
1673                 !is_VERTWS_utf8(s),
1674                 !is_VERTWS_latin1(s)
1675             );
1676         case HORIZWS:
1677             REXEC_FBC_CSCAN(
1678                 is_HORIZWS_utf8(s),
1679                 is_HORIZWS_latin1(s)
1680             );
1681         case NHORIZWS:
1682             REXEC_FBC_CSCAN(
1683                 !is_HORIZWS_utf8(s),
1684                 !is_HORIZWS_latin1(s)
1685             );
1686         case AHOCORASICKC:
1687         case AHOCORASICK:
1688             {
1689                 DECL_TRIE_TYPE(c);
1690                 /* what trie are we using right now */
1691                 reg_ac_data *aho
1692                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1693                 reg_trie_data *trie
1694                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1695                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1696
1697                 const char *last_start = strend - trie->minlen;
1698 #ifdef DEBUGGING
1699                 const char *real_start = s;
1700 #endif
1701                 STRLEN maxlen = trie->maxlen;
1702                 SV *sv_points;
1703                 U8 **points; /* map of where we were in the input string
1704                                 when reading a given char. For ASCII this
1705                                 is unnecessary overhead as the relationship
1706                                 is always 1:1, but for Unicode, especially
1707                                 case folded Unicode this is not true. */
1708                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1709                 U8 *bitmap=NULL;
1710
1711
1712                 GET_RE_DEBUG_FLAGS_DECL;
1713
1714                 /* We can't just allocate points here. We need to wrap it in
1715                  * an SV so it gets freed properly if there is a croak while
1716                  * running the match */
1717                 ENTER;
1718                 SAVETMPS;
1719                 sv_points=newSV(maxlen * sizeof(U8 *));
1720                 SvCUR_set(sv_points,
1721                     maxlen * sizeof(U8 *));
1722                 SvPOK_on(sv_points);
1723                 sv_2mortal(sv_points);
1724                 points=(U8**)SvPV_nolen(sv_points );
1725                 if ( trie_type != trie_utf8_fold
1726                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1727                 {
1728                     if (trie->bitmap)
1729                         bitmap=(U8*)trie->bitmap;
1730                     else
1731                         bitmap=(U8*)ANYOF_BITMAP(c);
1732                 }
1733                 /* this is the Aho-Corasick algorithm modified a touch
1734                    to include special handling for long "unknown char"
1735                    sequences. The basic idea being that we use AC as long
1736                    as we are dealing with a possible matching char, when
1737                    we encounter an unknown char (and we have not encountered
1738                    an accepting state) we scan forward until we find a legal
1739                    starting char.
1740                    AC matching is basically that of trie matching, except
1741                    that when we encounter a failing transition, we fall back
1742                    to the current states "fail state", and try the current char
1743                    again, a process we repeat until we reach the root state,
1744                    state 1, or a legal transition. If we fail on the root state
1745                    then we can either terminate if we have reached an accepting
1746                    state previously, or restart the entire process from the beginning
1747                    if we have not.
1748
1749                  */
1750                 while (s <= last_start) {
1751                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1752                     U8 *uc = (U8*)s;
1753                     U16 charid = 0;
1754                     U32 base = 1;
1755                     U32 state = 1;
1756                     UV uvc = 0;
1757                     STRLEN len = 0;
1758                     STRLEN foldlen = 0;
1759                     U8 *uscan = (U8*)NULL;
1760                     U8 *leftmost = NULL;
1761 #ifdef DEBUGGING
1762                     U32 accepted_word= 0;
1763 #endif
1764                     U32 pointpos = 0;
1765
1766                     while ( state && uc <= (U8*)strend ) {
1767                         int failed=0;
1768                         U32 word = aho->states[ state ].wordnum;
1769
1770                         if( state==1 ) {
1771                             if ( bitmap ) {
1772                                 DEBUG_TRIE_EXECUTE_r(
1773                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1774                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1775                                             (char *)uc, utf8_target );
1776                                         PerlIO_printf( Perl_debug_log,
1777                                             " Scanning for legal start char...\n");
1778                                     }
1779                                 );
1780                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1781                                     uc++;
1782                                 }
1783                                 s= (char *)uc;
1784                             }
1785                             if (uc >(U8*)last_start) break;
1786                         }
1787
1788                         if ( word ) {
1789                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1790                             if (!leftmost || lpos < leftmost) {
1791                                 DEBUG_r(accepted_word=word);
1792                                 leftmost= lpos;
1793                             }
1794                             if (base==0) break;
1795
1796                         }
1797                         points[pointpos++ % maxlen]= uc;
1798                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1799                                              uscan, len, uvc, charid, foldlen,
1800                                              foldbuf, uniflags);
1801                         DEBUG_TRIE_EXECUTE_r({
1802                             dump_exec_pos( (char *)uc, c, strend, real_start,
1803                                 s,   utf8_target );
1804                             PerlIO_printf(Perl_debug_log,
1805                                 " Charid:%3u CP:%4"UVxf" ",
1806                                  charid, uvc);
1807                         });
1808
1809                         do {
1810 #ifdef DEBUGGING
1811                             word = aho->states[ state ].wordnum;
1812 #endif
1813                             base = aho->states[ state ].trans.base;
1814
1815                             DEBUG_TRIE_EXECUTE_r({
1816                                 if (failed)
1817                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1818                                         s,   utf8_target );
1819                                 PerlIO_printf( Perl_debug_log,
1820                                     "%sState: %4"UVxf", word=%"UVxf,
1821                                     failed ? " Fail transition to " : "",
1822                                     (UV)state, (UV)word);
1823                             });
1824                             if ( base ) {
1825                                 U32 tmp;
1826                                 I32 offset;
1827                                 if (charid &&
1828                                      ( ((offset = base + charid
1829                                         - 1 - trie->uniquecharcount)) >= 0)
1830                                      && ((U32)offset < trie->lasttrans)
1831                                      && trie->trans[offset].check == state
1832                                      && (tmp=trie->trans[offset].next))
1833                                 {
1834                                     DEBUG_TRIE_EXECUTE_r(
1835                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1836                                     state = tmp;
1837                                     break;
1838                                 }
1839                                 else {
1840                                     DEBUG_TRIE_EXECUTE_r(
1841                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1842                                     failed = 1;
1843                                     state = aho->fail[state];
1844                                 }
1845                             }
1846                             else {
1847                                 /* we must be accepting here */
1848                                 DEBUG_TRIE_EXECUTE_r(
1849                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1850                                 failed = 1;
1851                                 break;
1852                             }
1853                         } while(state);
1854                         uc += len;
1855                         if (failed) {
1856                             if (leftmost)
1857                                 break;
1858                             if (!state) state = 1;
1859                         }
1860                     }
1861                     if ( aho->states[ state ].wordnum ) {
1862                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1863                         if (!leftmost || lpos < leftmost) {
1864                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1865                             leftmost = lpos;
1866                         }
1867                     }
1868                     if (leftmost) {
1869                         s = (char*)leftmost;
1870                         DEBUG_TRIE_EXECUTE_r({
1871                             PerlIO_printf(
1872                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1873                                 (UV)accepted_word, (IV)(s - real_start)
1874                             );
1875                         });
1876                         if (!reginfo || regtry(reginfo, &s)) {
1877                             FREETMPS;
1878                             LEAVE;
1879                             goto got_it;
1880                         }
1881                         s = HOPc(s,1);
1882                         DEBUG_TRIE_EXECUTE_r({
1883                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1884                         });
1885                     } else {
1886                         DEBUG_TRIE_EXECUTE_r(
1887                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1888                         break;
1889                     }
1890                 }
1891                 FREETMPS;
1892                 LEAVE;
1893             }
1894             break;
1895         default:
1896             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1897             break;
1898         }
1899         return 0;
1900       got_it:
1901         return s;
1902 }
1903
1904
1905 /*
1906  - regexec_flags - match a regexp against a string
1907  */
1908 I32
1909 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1910               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1911 /* strend: pointer to null at end of string */
1912 /* strbeg: real beginning of string */
1913 /* minend: end of match must be >=minend after stringarg. */
1914 /* data: May be used for some additional optimizations.
1915          Currently its only used, with a U32 cast, for transmitting
1916          the ganch offset when doing a /g match. This will change */
1917 /* nosave: For optimizations. */
1918 {
1919     dVAR;
1920     struct regexp *const prog = (struct regexp *)SvANY(rx);
1921     /*register*/ char *s;
1922     register regnode *c;
1923     /*register*/ char *startpos = stringarg;
1924     I32 minlen;         /* must match at least this many chars */
1925     I32 dontbother = 0; /* how many characters not to try at end */
1926     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1927     I32 scream_pos = -1;                /* Internal iterator of scream. */
1928     char *scream_olds = NULL;
1929     const bool utf8_target = cBOOL(DO_UTF8(sv));
1930     I32 multiline;
1931     RXi_GET_DECL(prog,progi);
1932     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1933     regexp_paren_pair *swap = NULL;
1934     GET_RE_DEBUG_FLAGS_DECL;
1935
1936     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1937     PERL_UNUSED_ARG(data);
1938
1939     /* Be paranoid... */
1940     if (prog == NULL || startpos == NULL) {
1941         Perl_croak(aTHX_ "NULL regexp parameter");
1942         return 0;
1943     }
1944
1945     multiline = prog->extflags & RXf_PMf_MULTILINE;
1946     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1947
1948     RX_MATCH_UTF8_set(rx, utf8_target);
1949     DEBUG_EXECUTE_r(
1950         debug_start_match(rx, utf8_target, startpos, strend,
1951         "Matching");
1952     );
1953
1954     minlen = prog->minlen;
1955
1956     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1957         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1958                               "String too short [regexec_flags]...\n"));
1959         goto phooey;
1960     }
1961
1962
1963     /* Check validity of program. */
1964     if (UCHARAT(progi->program) != REG_MAGIC) {
1965         Perl_croak(aTHX_ "corrupted regexp program");
1966     }
1967
1968     PL_reg_flags = 0;
1969     PL_reg_eval_set = 0;
1970     PL_reg_maxiter = 0;
1971
1972     if (RX_UTF8(rx))
1973         PL_reg_flags |= RF_utf8;
1974
1975     /* Mark beginning of line for ^ and lookbehind. */
1976     reginfo.bol = startpos; /* XXX not used ??? */
1977     PL_bostr  = strbeg;
1978     reginfo.sv = sv;
1979
1980     /* Mark end of line for $ (and such) */
1981     PL_regeol = strend;
1982
1983     /* see how far we have to get to not match where we matched before */
1984     reginfo.till = startpos+minend;
1985
1986     /* If there is a "must appear" string, look for it. */
1987     s = startpos;
1988
1989     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1990         MAGIC *mg;
1991         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
1992             reginfo.ganch = startpos + prog->gofs;
1993             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1994               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1995         } else if (sv && SvTYPE(sv) >= SVt_PVMG
1996                   && SvMAGIC(sv)
1997                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1998                   && mg->mg_len >= 0) {
1999             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2000             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2001                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2002
2003             if (prog->extflags & RXf_ANCH_GPOS) {
2004                 if (s > reginfo.ganch)
2005                     goto phooey;
2006                 s = reginfo.ganch - prog->gofs;
2007                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2008                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2009                 if (s < strbeg)
2010                     goto phooey;
2011             }
2012         }
2013         else if (data) {
2014             reginfo.ganch = strbeg + PTR2UV(data);
2015             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2016                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2017
2018         } else {                                /* pos() not defined */
2019             reginfo.ganch = strbeg;
2020             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2021                  "GPOS: reginfo.ganch = strbeg\n"));
2022         }
2023     }
2024     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2025         /* We have to be careful. If the previous successful match
2026            was from this regex we don't want a subsequent partially
2027            successful match to clobber the old results.
2028            So when we detect this possibility we add a swap buffer
2029            to the re, and switch the buffer each match. If we fail
2030            we switch it back, otherwise we leave it swapped.
2031         */
2032         swap = prog->offs;
2033         /* do we need a save destructor here for eval dies? */
2034         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2035     }
2036     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2037         re_scream_pos_data d;
2038
2039         d.scream_olds = &scream_olds;
2040         d.scream_pos = &scream_pos;
2041         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2042         if (!s) {
2043             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2044             goto phooey;        /* not present */
2045         }
2046     }
2047
2048
2049
2050     /* Simplest case:  anchored match need be tried only once. */
2051     /*  [unless only anchor is BOL and multiline is set] */
2052     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2053         if (s == startpos && regtry(&reginfo, &startpos))
2054             goto got_it;
2055         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2056                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2057         {
2058             char *end;
2059
2060             if (minlen)
2061                 dontbother = minlen - 1;
2062             end = HOP3c(strend, -dontbother, strbeg) - 1;
2063             /* for multiline we only have to try after newlines */
2064             if (prog->check_substr || prog->check_utf8) {
2065                 /* because of the goto we can not easily reuse the macros for bifurcating the
2066                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2067                 if (utf8_target) {
2068                     if (s == startpos)
2069                         goto after_try_utf8;
2070                     while (1) {
2071                         if (regtry(&reginfo, &s)) {
2072                             goto got_it;
2073                         }
2074                       after_try_utf8:
2075                         if (s > end) {
2076                             goto phooey;
2077                         }
2078                         if (prog->extflags & RXf_USE_INTUIT) {
2079                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2080                             if (!s) {
2081                                 goto phooey;
2082                             }
2083                         }
2084                         else {
2085                             s += UTF8SKIP(s);
2086                         }
2087                     }
2088                 } /* end search for check string in unicode */
2089                 else {
2090                     if (s == startpos) {
2091                         goto after_try_latin;
2092                     }
2093                     while (1) {
2094                         if (regtry(&reginfo, &s)) {
2095                             goto got_it;
2096                         }
2097                       after_try_latin:
2098                         if (s > end) {
2099                             goto phooey;
2100                         }
2101                         if (prog->extflags & RXf_USE_INTUIT) {
2102                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2103                             if (!s) {
2104                                 goto phooey;
2105                             }
2106                         }
2107                         else {
2108                             s++;
2109                         }
2110                     }
2111                 } /* end search for check string in latin*/
2112             } /* end search for check string */
2113             else { /* search for newline */
2114                 if (s > startpos) {
2115                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2116                     s--;
2117                 }
2118                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2119                 while (s < end) {
2120                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2121                         if (regtry(&reginfo, &s))
2122                             goto got_it;
2123                     }
2124                 }
2125             } /* end search for newline */
2126         } /* end anchored/multiline check string search */
2127         goto phooey;
2128     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2129     {
2130         /* the warning about reginfo.ganch being used without intialization
2131            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2132            and we only enter this block when the same bit is set. */
2133         char *tmp_s = reginfo.ganch - prog->gofs;
2134
2135         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2136             goto got_it;
2137         goto phooey;
2138     }
2139
2140     /* Messy cases:  unanchored match. */
2141     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2142         /* we have /x+whatever/ */
2143         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2144         char ch;
2145 #ifdef DEBUGGING
2146         int did_match = 0;
2147 #endif
2148         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2149             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2150         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2151
2152         if (utf8_target) {
2153             REXEC_FBC_SCAN(
2154                 if (*s == ch) {
2155                     DEBUG_EXECUTE_r( did_match = 1 );
2156                     if (regtry(&reginfo, &s)) goto got_it;
2157                     s += UTF8SKIP(s);
2158                     while (s < strend && *s == ch)
2159                         s += UTF8SKIP(s);
2160                 }
2161             );
2162         }
2163         else {
2164             REXEC_FBC_SCAN(
2165                 if (*s == ch) {
2166                     DEBUG_EXECUTE_r( did_match = 1 );
2167                     if (regtry(&reginfo, &s)) goto got_it;
2168                     s++;
2169                     while (s < strend && *s == ch)
2170                         s++;
2171                 }
2172             );
2173         }
2174         DEBUG_EXECUTE_r(if (!did_match)
2175                 PerlIO_printf(Perl_debug_log,
2176                                   "Did not find anchored character...\n")
2177                );
2178     }
2179     else if (prog->anchored_substr != NULL
2180               || prog->anchored_utf8 != NULL
2181               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2182                   && prog->float_max_offset < strend - s)) {
2183         SV *must;
2184         I32 back_max;
2185         I32 back_min;
2186         char *last;
2187         char *last1;            /* Last position checked before */
2188 #ifdef DEBUGGING
2189         int did_match = 0;
2190 #endif
2191         if (prog->anchored_substr || prog->anchored_utf8) {
2192             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2193                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2194             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2195             back_max = back_min = prog->anchored_offset;
2196         } else {
2197             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2198                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2199             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2200             back_max = prog->float_max_offset;
2201             back_min = prog->float_min_offset;
2202         }
2203
2204
2205         if (must == &PL_sv_undef)
2206             /* could not downgrade utf8 check substring, so must fail */
2207             goto phooey;
2208
2209         if (back_min<0) {
2210             last = strend;
2211         } else {
2212             last = HOP3c(strend,        /* Cannot start after this */
2213                   -(I32)(CHR_SVLEN(must)
2214                          - (SvTAIL(must) != 0) + back_min), strbeg);
2215         }
2216         if (s > PL_bostr)
2217             last1 = HOPc(s, -1);
2218         else
2219             last1 = s - 1;      /* bogus */
2220
2221         /* XXXX check_substr already used to find "s", can optimize if
2222            check_substr==must. */
2223         scream_pos = -1;
2224         dontbother = end_shift;
2225         strend = HOPc(strend, -dontbother);
2226         while ( (s <= last) &&
2227                 ((flags & REXEC_SCREAM)
2228                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2229                                     end_shift, &scream_pos, 0))
2230                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2231                                   (unsigned char*)strend, must,
2232                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2233             /* we may be pointing at the wrong string */
2234             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2235                 s = strbeg + (s - SvPVX_const(sv));
2236             DEBUG_EXECUTE_r( did_match = 1 );
2237             if (HOPc(s, -back_max) > last1) {
2238                 last1 = HOPc(s, -back_min);
2239                 s = HOPc(s, -back_max);
2240             }
2241             else {
2242                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2243
2244                 last1 = HOPc(s, -back_min);
2245                 s = t;
2246             }
2247             if (utf8_target) {
2248                 while (s <= last1) {
2249                     if (regtry(&reginfo, &s))
2250                         goto got_it;
2251                     s += UTF8SKIP(s);
2252                 }
2253             }
2254             else {
2255                 while (s <= last1) {
2256                     if (regtry(&reginfo, &s))
2257                         goto got_it;
2258                     s++;
2259                 }
2260             }
2261         }
2262         DEBUG_EXECUTE_r(if (!did_match) {
2263             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2264                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2265             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2266                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2267                                ? "anchored" : "floating"),
2268                 quoted, RE_SV_TAIL(must));
2269         });
2270         goto phooey;
2271     }
2272     else if ( (c = progi->regstclass) ) {
2273         if (minlen) {
2274             const OPCODE op = OP(progi->regstclass);
2275             /* don't bother with what can't match */
2276             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2277                 strend = HOPc(strend, -(minlen - 1));
2278         }
2279         DEBUG_EXECUTE_r({
2280             SV * const prop = sv_newmortal();
2281             regprop(prog, prop, c);
2282             {
2283                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2284                     s,strend-s,60);
2285                 PerlIO_printf(Perl_debug_log,
2286                     "Matching stclass %.*s against %s (%d bytes)\n",
2287                     (int)SvCUR(prop), SvPVX_const(prop),
2288                      quoted, (int)(strend - s));
2289             }
2290         });
2291         if (find_byclass(prog, c, s, strend, &reginfo))
2292             goto got_it;
2293         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2294     }
2295     else {
2296         dontbother = 0;
2297         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2298             /* Trim the end. */
2299             char *last;
2300             SV* float_real;
2301
2302             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2303                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2304             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2305
2306             if (flags & REXEC_SCREAM) {
2307                 last = screaminstr(sv, float_real, s - strbeg,
2308                                    end_shift, &scream_pos, 1); /* last one */
2309                 if (!last)
2310                     last = scream_olds; /* Only one occurrence. */
2311                 /* we may be pointing at the wrong string */
2312                 else if (RXp_MATCH_COPIED(prog))
2313                     s = strbeg + (s - SvPVX_const(sv));
2314             }
2315             else {
2316                 STRLEN len;
2317                 const char * const little = SvPV_const(float_real, len);
2318
2319                 if (SvTAIL(float_real)) {
2320                     if (memEQ(strend - len + 1, little, len - 1))
2321                         last = strend - len + 1;
2322                     else if (!multiline)
2323                         last = memEQ(strend - len, little, len)
2324                             ? strend - len : NULL;
2325                     else
2326                         goto find_last;
2327                 } else {
2328                   find_last:
2329                     if (len)
2330                         last = rninstr(s, strend, little, little + len);
2331                     else
2332                         last = strend;  /* matching "$" */
2333                 }
2334             }
2335             if (last == NULL) {
2336                 DEBUG_EXECUTE_r(
2337                     PerlIO_printf(Perl_debug_log,
2338                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2339                         PL_colors[4], PL_colors[5]));
2340                 goto phooey; /* Should not happen! */
2341             }
2342             dontbother = strend - last + prog->float_min_offset;
2343         }
2344         if (minlen && (dontbother < minlen))
2345             dontbother = minlen - 1;
2346         strend -= dontbother;              /* this one's always in bytes! */
2347         /* We don't know much -- general case. */
2348         if (utf8_target) {
2349             for (;;) {
2350                 if (regtry(&reginfo, &s))
2351                     goto got_it;
2352                 if (s >= strend)
2353                     break;
2354                 s += UTF8SKIP(s);
2355             };
2356         }
2357         else {
2358             do {
2359                 if (regtry(&reginfo, &s))
2360                     goto got_it;
2361             } while (s++ < strend);
2362         }
2363     }
2364
2365     /* Failure. */
2366     goto phooey;
2367
2368 got_it:
2369     Safefree(swap);
2370     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2371
2372     if (PL_reg_eval_set)
2373         restore_pos(aTHX_ prog);
2374     if (RXp_PAREN_NAMES(prog))
2375         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2376
2377     /* make sure $`, $&, $', and $digit will work later */
2378     if ( !(flags & REXEC_NOT_FIRST) ) {
2379         RX_MATCH_COPY_FREE(rx);
2380         if (flags & REXEC_COPY_STR) {
2381             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2382 #ifdef PERL_OLD_COPY_ON_WRITE
2383             if ((SvIsCOW(sv)
2384                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2385                 if (DEBUG_C_TEST) {
2386                     PerlIO_printf(Perl_debug_log,
2387                                   "Copy on write: regexp capture, type %d\n",
2388                                   (int) SvTYPE(sv));
2389                 }
2390                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2391                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2392                 assert (SvPOKp(prog->saved_copy));
2393             } else
2394 #endif
2395             {
2396                 RX_MATCH_COPIED_on(rx);
2397                 s = savepvn(strbeg, i);
2398                 prog->subbeg = s;
2399             }
2400             prog->sublen = i;
2401         }
2402         else {
2403             prog->subbeg = strbeg;
2404             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2405         }
2406     }
2407
2408     return 1;
2409
2410 phooey:
2411     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2412                           PL_colors[4], PL_colors[5]));
2413     if (PL_reg_eval_set)
2414         restore_pos(aTHX_ prog);
2415     if (swap) {
2416         /* we failed :-( roll it back */
2417         Safefree(prog->offs);
2418         prog->offs = swap;
2419     }
2420
2421     return 0;
2422 }
2423
2424
2425 /*
2426  - regtry - try match at specific point
2427  */
2428 STATIC I32                      /* 0 failure, 1 success */
2429 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2430 {
2431     dVAR;
2432     CHECKPOINT lastcp;
2433     REGEXP *const rx = reginfo->prog;
2434     regexp *const prog = (struct regexp *)SvANY(rx);
2435     RXi_GET_DECL(prog,progi);
2436     GET_RE_DEBUG_FLAGS_DECL;
2437
2438     PERL_ARGS_ASSERT_REGTRY;
2439
2440     reginfo->cutpoint=NULL;
2441
2442     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2443         MAGIC *mg;
2444
2445         PL_reg_eval_set = RS_init;
2446         DEBUG_EXECUTE_r(DEBUG_s(
2447             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2448                           (IV)(PL_stack_sp - PL_stack_base));
2449             ));
2450         SAVESTACK_CXPOS();
2451         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2452         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2453         SAVETMPS;
2454         /* Apparently this is not needed, judging by wantarray. */
2455         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2456            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2457
2458         if (reginfo->sv) {
2459             /* Make $_ available to executed code. */
2460             if (reginfo->sv != DEFSV) {
2461                 SAVE_DEFSV;
2462                 DEFSV_set(reginfo->sv);
2463             }
2464
2465             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2466                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2467                 /* prepare for quick setting of pos */
2468 #ifdef PERL_OLD_COPY_ON_WRITE
2469                 if (SvIsCOW(reginfo->sv))
2470                     sv_force_normal_flags(reginfo->sv, 0);
2471 #endif
2472                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2473                                  &PL_vtbl_mglob, NULL, 0);
2474                 mg->mg_len = -1;
2475             }
2476             PL_reg_magic    = mg;
2477             PL_reg_oldpos   = mg->mg_len;
2478             SAVEDESTRUCTOR_X(restore_pos, prog);
2479         }
2480         if (!PL_reg_curpm) {
2481             Newxz(PL_reg_curpm, 1, PMOP);
2482 #ifdef USE_ITHREADS
2483             {
2484                 SV* const repointer = &PL_sv_undef;
2485                 /* this regexp is also owned by the new PL_reg_curpm, which
2486                    will try to free it.  */
2487                 av_push(PL_regex_padav, repointer);
2488                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2489                 PL_regex_pad = AvARRAY(PL_regex_padav);
2490             }
2491 #endif
2492         }
2493 #ifdef USE_ITHREADS
2494         /* It seems that non-ithreads works both with and without this code.
2495            So for efficiency reasons it seems best not to have the code
2496            compiled when it is not needed.  */
2497         /* This is safe against NULLs: */
2498         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2499         /* PM_reg_curpm owns a reference to this regexp.  */
2500         ReREFCNT_inc(rx);
2501 #endif
2502         PM_SETRE(PL_reg_curpm, rx);
2503         PL_reg_oldcurpm = PL_curpm;
2504         PL_curpm = PL_reg_curpm;
2505         if (RXp_MATCH_COPIED(prog)) {
2506             /*  Here is a serious problem: we cannot rewrite subbeg,
2507                 since it may be needed if this match fails.  Thus
2508                 $` inside (?{}) could fail... */
2509             PL_reg_oldsaved = prog->subbeg;
2510             PL_reg_oldsavedlen = prog->sublen;
2511 #ifdef PERL_OLD_COPY_ON_WRITE
2512             PL_nrs = prog->saved_copy;
2513 #endif
2514             RXp_MATCH_COPIED_off(prog);
2515         }
2516         else
2517             PL_reg_oldsaved = NULL;
2518         prog->subbeg = PL_bostr;
2519         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2520     }
2521     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2522     prog->offs[0].start = *startpos - PL_bostr;
2523     PL_reginput = *startpos;
2524     PL_reglastparen = &prog->lastparen;
2525     PL_reglastcloseparen = &prog->lastcloseparen;
2526     prog->lastparen = 0;
2527     prog->lastcloseparen = 0;
2528     PL_regsize = 0;
2529     PL_regoffs = prog->offs;
2530     if (PL_reg_start_tmpl <= prog->nparens) {
2531         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2532         if(PL_reg_start_tmp)
2533             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2534         else
2535             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2536     }
2537
2538     /* XXXX What this code is doing here?!!!  There should be no need
2539        to do this again and again, PL_reglastparen should take care of
2540        this!  --ilya*/
2541
2542     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2543      * Actually, the code in regcppop() (which Ilya may be meaning by
2544      * PL_reglastparen), is not needed at all by the test suite
2545      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2546      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2547      * Meanwhile, this code *is* needed for the
2548      * above-mentioned test suite tests to succeed.  The common theme
2549      * on those tests seems to be returning null fields from matches.
2550      * --jhi updated by dapm */
2551 #if 1
2552     if (prog->nparens) {
2553         regexp_paren_pair *pp = PL_regoffs;
2554         register I32 i;
2555         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2556             ++pp;
2557             pp->start = -1;
2558             pp->end = -1;
2559         }
2560     }
2561 #endif
2562     REGCP_SET(lastcp);
2563     if (regmatch(reginfo, progi->program + 1)) {
2564         PL_regoffs[0].end = PL_reginput - PL_bostr;
2565         return 1;
2566     }
2567     if (reginfo->cutpoint)
2568         *startpos= reginfo->cutpoint;
2569     REGCP_UNWIND(lastcp);
2570     return 0;
2571 }
2572
2573
2574 #define sayYES goto yes
2575 #define sayNO goto no
2576 #define sayNO_SILENT goto no_silent
2577
2578 /* we dont use STMT_START/END here because it leads to
2579    "unreachable code" warnings, which are bogus, but distracting. */
2580 #define CACHEsayNO \
2581     if (ST.cache_mask) \
2582        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2583     sayNO
2584
2585 /* this is used to determine how far from the left messages like
2586    'failed...' are printed. It should be set such that messages
2587    are inline with the regop output that created them.
2588 */
2589 #define REPORT_CODE_OFF 32
2590
2591
2592 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2593 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2594
2595 #define SLAB_FIRST(s) (&(s)->states[0])
2596 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2597
2598 /* grab a new slab and return the first slot in it */
2599
2600 STATIC regmatch_state *
2601 S_push_slab(pTHX)
2602 {
2603 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2604     dMY_CXT;
2605 #endif
2606     regmatch_slab *s = PL_regmatch_slab->next;
2607     if (!s) {
2608         Newx(s, 1, regmatch_slab);
2609         s->prev = PL_regmatch_slab;
2610         s->next = NULL;
2611         PL_regmatch_slab->next = s;
2612     }
2613     PL_regmatch_slab = s;
2614     return SLAB_FIRST(s);
2615 }
2616
2617
2618 /* push a new state then goto it */
2619
2620 #define PUSH_STATE_GOTO(state, node) \
2621     scan = node; \
2622     st->resume_state = state; \
2623     goto push_state;
2624
2625 /* push a new state with success backtracking, then goto it */
2626
2627 #define PUSH_YES_STATE_GOTO(state, node) \
2628     scan = node; \
2629     st->resume_state = state; \
2630     goto push_yes_state;
2631
2632
2633
2634 /*
2635
2636 regmatch() - main matching routine
2637
2638 This is basically one big switch statement in a loop. We execute an op,
2639 set 'next' to point the next op, and continue. If we come to a point which
2640 we may need to backtrack to on failure such as (A|B|C), we push a
2641 backtrack state onto the backtrack stack. On failure, we pop the top
2642 state, and re-enter the loop at the state indicated. If there are no more
2643 states to pop, we return failure.
2644
2645 Sometimes we also need to backtrack on success; for example /A+/, where
2646 after successfully matching one A, we need to go back and try to
2647 match another one; similarly for lookahead assertions: if the assertion
2648 completes successfully, we backtrack to the state just before the assertion
2649 and then carry on.  In these cases, the pushed state is marked as
2650 'backtrack on success too'. This marking is in fact done by a chain of
2651 pointers, each pointing to the previous 'yes' state. On success, we pop to
2652 the nearest yes state, discarding any intermediate failure-only states.
2653 Sometimes a yes state is pushed just to force some cleanup code to be
2654 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2655 it to free the inner regex.
2656
2657 Note that failure backtracking rewinds the cursor position, while
2658 success backtracking leaves it alone.
2659
2660 A pattern is complete when the END op is executed, while a subpattern
2661 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2662 ops trigger the "pop to last yes state if any, otherwise return true"
2663 behaviour.
2664
2665 A common convention in this function is to use A and B to refer to the two
2666 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2667 the subpattern to be matched possibly multiple times, while B is the entire
2668 rest of the pattern. Variable and state names reflect this convention.
2669
2670 The states in the main switch are the union of ops and failure/success of
2671 substates associated with with that op.  For example, IFMATCH is the op
2672 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2673 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2674 successfully matched A and IFMATCH_A_fail is a state saying that we have
2675 just failed to match A. Resume states always come in pairs. The backtrack
2676 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2677 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2678 on success or failure.
2679
2680 The struct that holds a backtracking state is actually a big union, with
2681 one variant for each major type of op. The variable st points to the
2682 top-most backtrack struct. To make the code clearer, within each
2683 block of code we #define ST to alias the relevant union.
2684
2685 Here's a concrete example of a (vastly oversimplified) IFMATCH
2686 implementation:
2687
2688     switch (state) {
2689     ....
2690
2691 #define ST st->u.ifmatch
2692
2693     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2694         ST.foo = ...; // some state we wish to save
2695         ...
2696         // push a yes backtrack state with a resume value of
2697         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2698         // first node of A:
2699         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2700         // NOTREACHED
2701
2702     case IFMATCH_A: // we have successfully executed A; now continue with B
2703         next = B;
2704         bar = ST.foo; // do something with the preserved value
2705         break;
2706
2707     case IFMATCH_A_fail: // A failed, so the assertion failed
2708         ...;   // do some housekeeping, then ...
2709         sayNO; // propagate the failure
2710
2711 #undef ST
2712
2713     ...
2714     }
2715
2716 For any old-timers reading this who are familiar with the old recursive
2717 approach, the code above is equivalent to:
2718
2719     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2720     {
2721         int foo = ...
2722         ...
2723         if (regmatch(A)) {
2724             next = B;
2725             bar = foo;
2726             break;
2727         }
2728         ...;   // do some housekeeping, then ...
2729         sayNO; // propagate the failure
2730     }
2731
2732 The topmost backtrack state, pointed to by st, is usually free. If you
2733 want to claim it, populate any ST.foo fields in it with values you wish to
2734 save, then do one of
2735
2736         PUSH_STATE_GOTO(resume_state, node);
2737         PUSH_YES_STATE_GOTO(resume_state, node);
2738
2739 which sets that backtrack state's resume value to 'resume_state', pushes a
2740 new free entry to the top of the backtrack stack, then goes to 'node'.
2741 On backtracking, the free slot is popped, and the saved state becomes the
2742 new free state. An ST.foo field in this new top state can be temporarily
2743 accessed to retrieve values, but once the main loop is re-entered, it
2744 becomes available for reuse.
2745
2746 Note that the depth of the backtrack stack constantly increases during the
2747 left-to-right execution of the pattern, rather than going up and down with
2748 the pattern nesting. For example the stack is at its maximum at Z at the
2749 end of the pattern, rather than at X in the following:
2750
2751     /(((X)+)+)+....(Y)+....Z/
2752
2753 The only exceptions to this are lookahead/behind assertions and the cut,
2754 (?>A), which pop all the backtrack states associated with A before
2755 continuing.
2756
2757 Bascktrack state structs are allocated in slabs of about 4K in size.
2758 PL_regmatch_state and st always point to the currently active state,
2759 and PL_regmatch_slab points to the slab currently containing
2760 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2761 allocated, and is never freed until interpreter destruction. When the slab
2762 is full, a new one is allocated and chained to the end. At exit from
2763 regmatch(), slabs allocated since entry are freed.
2764
2765 */
2766
2767
2768 #define DEBUG_STATE_pp(pp)                                  \
2769     DEBUG_STATE_r({                                         \
2770         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2771         PerlIO_printf(Perl_debug_log,                       \
2772             "    %*s"pp" %s%s%s%s%s\n",                     \
2773             depth*2, "",                                    \
2774             PL_reg_name[st->resume_state],                     \
2775             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2776             ((st==yes_state) ? "Y" : ""),                   \
2777             ((st==mark_state) ? "M" : ""),                  \
2778             ((st==yes_state||st==mark_state) ? "]" : "")    \
2779         );                                                  \
2780     });
2781
2782
2783 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2784
2785 #ifdef DEBUGGING
2786
2787 STATIC void
2788 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2789     const char *start, const char *end, const char *blurb)
2790 {
2791     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2792
2793     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2794
2795     if (!PL_colorset)
2796             reginitcolors();
2797     {
2798         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2799             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2800
2801         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2802             start, end - start, 60);
2803
2804         PerlIO_printf(Perl_debug_log,
2805             "%s%s REx%s %s against %s\n",
2806                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2807
2808         if (utf8_target||utf8_pat)
2809             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2810                 utf8_pat ? "pattern" : "",
2811                 utf8_pat && utf8_target ? " and " : "",
2812                 utf8_target ? "string" : ""
2813             );
2814     }
2815 }
2816
2817 STATIC void
2818 S_dump_exec_pos(pTHX_ const char *locinput,
2819                       const regnode *scan,
2820                       const char *loc_regeol,
2821                       const char *loc_bostr,
2822                       const char *loc_reg_starttry,
2823                       const bool utf8_target)
2824 {
2825     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2826     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2827     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2828     /* The part of the string before starttry has one color
2829        (pref0_len chars), between starttry and current
2830        position another one (pref_len - pref0_len chars),
2831        after the current position the third one.
2832        We assume that pref0_len <= pref_len, otherwise we
2833        decrease pref0_len.  */
2834     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2835         ? (5 + taill) - l : locinput - loc_bostr;
2836     int pref0_len;
2837
2838     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2839
2840     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2841         pref_len++;
2842     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2843     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2844         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2845               ? (5 + taill) - pref_len : loc_regeol - locinput);
2846     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2847         l--;
2848     if (pref0_len < 0)
2849         pref0_len = 0;
2850     if (pref0_len > pref_len)
2851         pref0_len = pref_len;
2852     {
2853         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2854
2855         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2856             (locinput - pref_len),pref0_len, 60, 4, 5);
2857
2858         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2859                     (locinput - pref_len + pref0_len),
2860                     pref_len - pref0_len, 60, 2, 3);
2861
2862         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2863                     locinput, loc_regeol - locinput, 10, 0, 1);
2864
2865         const STRLEN tlen=len0+len1+len2;
2866         PerlIO_printf(Perl_debug_log,
2867                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2868                     (IV)(locinput - loc_bostr),
2869                     len0, s0,
2870                     len1, s1,
2871                     (docolor ? "" : "> <"),
2872                     len2, s2,
2873                     (int)(tlen > 19 ? 0 :  19 - tlen),
2874                     "");
2875     }
2876 }
2877
2878 #endif
2879
2880 /* reg_check_named_buff_matched()
2881  * Checks to see if a named buffer has matched. The data array of
2882  * buffer numbers corresponding to the buffer is expected to reside
2883  * in the regexp->data->data array in the slot stored in the ARG() of
2884  * node involved. Note that this routine doesn't actually care about the
2885  * name, that information is not preserved from compilation to execution.
2886  * Returns the index of the leftmost defined buffer with the given name
2887  * or 0 if non of the buffers matched.
2888  */
2889 STATIC I32
2890 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2891 {
2892     I32 n;
2893     RXi_GET_DECL(rex,rexi);
2894     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2895     I32 *nums=(I32*)SvPVX(sv_dat);
2896
2897     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2898
2899     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2900         if ((I32)*PL_reglastparen >= nums[n] &&
2901             PL_regoffs[nums[n]].end != -1)
2902         {
2903             return nums[n];
2904         }
2905     }
2906     return 0;
2907 }
2908
2909
2910 /* free all slabs above current one  - called during LEAVE_SCOPE */
2911
2912 STATIC void
2913 S_clear_backtrack_stack(pTHX_ void *p)
2914 {
2915     regmatch_slab *s = PL_regmatch_slab->next;
2916     PERL_UNUSED_ARG(p);
2917
2918     if (!s)
2919         return;
2920     PL_regmatch_slab->next = NULL;
2921     while (s) {
2922         regmatch_slab * const osl = s;
2923         s = s->next;
2924         Safefree(osl);
2925     }
2926 }
2927
2928
2929 #define SETREX(Re1,Re2) \
2930     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2931     Re1 = (Re2)
2932
2933 STATIC I32                      /* 0 failure, 1 success */
2934 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2935 {
2936 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2937     dMY_CXT;
2938 #endif
2939     dVAR;
2940     register const bool utf8_target = PL_reg_match_utf8;
2941     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2942     REGEXP *rex_sv = reginfo->prog;
2943     regexp *rex = (struct regexp *)SvANY(rex_sv);
2944     RXi_GET_DECL(rex,rexi);
2945     I32 oldsave;
2946     /* the current state. This is a cached copy of PL_regmatch_state */
2947     register regmatch_state *st;
2948     /* cache heavy used fields of st in registers */
2949     register regnode *scan;
2950     register regnode *next;
2951     register U32 n = 0; /* general value; init to avoid compiler warning */
2952     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2953     register char *locinput = PL_reginput;
2954     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2955
2956     bool result = 0;        /* return value of S_regmatch */
2957     int depth = 0;          /* depth of backtrack stack */
2958     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2959     const U32 max_nochange_depth =
2960         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2961         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2962     regmatch_state *yes_state = NULL; /* state to pop to on success of
2963                                                             subpattern */
2964     /* mark_state piggy backs on the yes_state logic so that when we unwind
2965        the stack on success we can update the mark_state as we go */
2966     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2967     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2968     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2969     U32 state_num;
2970     bool no_final = 0;      /* prevent failure from backtracking? */
2971     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2972     char *startpoint = PL_reginput;
2973     SV *popmark = NULL;     /* are we looking for a mark? */
2974     SV *sv_commit = NULL;   /* last mark name seen in failure */
2975     SV *sv_yes_mark = NULL; /* last mark name we have seen
2976                                during a successfull match */
2977     U32 lastopen = 0;       /* last open we saw */
2978     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2979     SV* const oreplsv = GvSV(PL_replgv);
2980     /* these three flags are set by various ops to signal information to
2981      * the very next op. They have a useful lifetime of exactly one loop
2982      * iteration, and are not preserved or restored by state pushes/pops
2983      */
2984     bool sw = 0;            /* the condition value in (?(cond)a|b) */
2985     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
2986     int logical = 0;        /* the following EVAL is:
2987                                 0: (?{...})
2988                                 1: (?(?{...})X|Y)
2989                                 2: (??{...})
2990                                or the following IFMATCH/UNLESSM is:
2991                                 false: plain (?=foo)
2992                                 true:  used as a condition: (?(?=foo))
2993                             */
2994 #ifdef DEBUGGING
2995     GET_RE_DEBUG_FLAGS_DECL;
2996 #endif
2997
2998     PERL_ARGS_ASSERT_REGMATCH;
2999
3000     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3001             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3002     }));
3003     /* on first ever call to regmatch, allocate first slab */
3004     if (!PL_regmatch_slab) {
3005         Newx(PL_regmatch_slab, 1, regmatch_slab);
3006         PL_regmatch_slab->prev = NULL;
3007         PL_regmatch_slab->next = NULL;
3008         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3009     }
3010
3011     oldsave = PL_savestack_ix;
3012     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3013     SAVEVPTR(PL_regmatch_slab);
3014     SAVEVPTR(PL_regmatch_state);
3015
3016     /* grab next free state slot */
3017     st = ++PL_regmatch_state;
3018     if (st >  SLAB_LAST(PL_regmatch_slab))
3019         st = PL_regmatch_state = S_push_slab(aTHX);
3020
3021     /* Note that nextchr is a byte even in UTF */
3022     nextchr = UCHARAT(locinput);
3023     scan = prog;
3024     while (scan != NULL) {
3025
3026         DEBUG_EXECUTE_r( {
3027             SV * const prop = sv_newmortal();
3028             regnode *rnext=regnext(scan);
3029             DUMP_EXEC_POS( locinput, scan, utf8_target );
3030             regprop(rex, prop, scan);
3031
3032             PerlIO_printf(Perl_debug_log,
3033                     "%3"IVdf":%*s%s(%"IVdf")\n",
3034                     (IV)(scan - rexi->program), depth*2, "",
3035                     SvPVX_const(prop),
3036                     (PL_regkind[OP(scan)] == END || !rnext) ?
3037                         0 : (IV)(rnext - rexi->program));
3038         });
3039
3040         next = scan + NEXT_OFF(scan);
3041         if (next == scan)
3042             next = NULL;
3043         state_num = OP(scan);
3044
3045       reenter_switch:
3046
3047         assert(PL_reglastparen == &rex->lastparen);
3048         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3049         assert(PL_regoffs == rex->offs);
3050
3051         switch (state_num) {
3052         case BOL:
3053             if (locinput == PL_bostr)
3054             {
3055                 /* reginfo->till = reginfo->bol; */
3056                 break;
3057             }
3058             sayNO;
3059         case MBOL:
3060             if (locinput == PL_bostr ||
3061                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3062             {
3063                 break;
3064             }
3065             sayNO;
3066         case SBOL:
3067             if (locinput == PL_bostr)
3068                 break;
3069             sayNO;
3070         case GPOS:
3071             if (locinput == reginfo->ganch)
3072                 break;
3073             sayNO;
3074
3075         case KEEPS:
3076             /* update the startpoint */
3077             st->u.keeper.val = PL_regoffs[0].start;
3078             PL_reginput = locinput;
3079             PL_regoffs[0].start = locinput - PL_bostr;
3080             PUSH_STATE_GOTO(KEEPS_next, next);
3081             /*NOT-REACHED*/
3082         case KEEPS_next_fail:
3083             /* rollback the start point change */
3084             PL_regoffs[0].start = st->u.keeper.val;
3085             sayNO_SILENT;
3086             /*NOT-REACHED*/
3087         case EOL:
3088                 goto seol;
3089         case MEOL:
3090             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3091                 sayNO;
3092             break;
3093         case SEOL:
3094           seol:
3095             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3096                 sayNO;
3097             if (PL_regeol - locinput > 1)
3098                 sayNO;
3099             break;
3100         case EOS:
3101             if (PL_regeol != locinput)
3102                 sayNO;
3103             break;
3104         case SANY:
3105             if (!nextchr && locinput >= PL_regeol)
3106                 sayNO;
3107             if (utf8_target) {
3108                 locinput += PL_utf8skip[nextchr];
3109                 if (locinput > PL_regeol)
3110                     sayNO;
3111                 nextchr = UCHARAT(locinput);
3112             }
3113             else
3114                 nextchr = UCHARAT(++locinput);
3115             break;
3116         case CANY:
3117             if (!nextchr && locinput >= PL_regeol)
3118                 sayNO;
3119             nextchr = UCHARAT(++locinput);
3120             break;
3121         case REG_ANY:
3122             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3123                 sayNO;
3124             if (utf8_target) {
3125                 locinput += PL_utf8skip[nextchr];
3126                 if (locinput > PL_regeol)
3127                     sayNO;
3128                 nextchr = UCHARAT(locinput);
3129             }
3130             else
3131                 nextchr = UCHARAT(++locinput);
3132             break;
3133
3134 #undef  ST
3135 #define ST st->u.trie
3136         case TRIEC:
3137             /* In this case the charclass data is available inline so
3138                we can fail fast without a lot of extra overhead.
3139              */
3140             if (scan->flags == EXACT || !utf8_target) {
3141                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3142                     DEBUG_EXECUTE_r(
3143                         PerlIO_printf(Perl_debug_log,
3144                                   "%*s  %sfailed to match trie start class...%s\n",
3145                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3146                     );
3147                     sayNO_SILENT;
3148                     /* NOTREACHED */
3149                 }
3150             }
3151             /* FALL THROUGH */
3152         case TRIE:
3153             /* the basic plan of execution of the trie is:
3154              * At the beginning, run though all the states, and
3155              * find the longest-matching word. Also remember the position
3156              * of the shortest matching word. For example, this pattern:
3157              *    1  2 3 4    5
3158              *    ab|a|x|abcd|abc
3159              * when matched against the string "abcde", will generate
3160              * accept states for all words except 3, with the longest
3161              * matching word being 4, and the shortest being 1 (with
3162              * the position being after char 1 of the string).
3163              *
3164              * Then for each matching word, in word order (i.e. 1,2,4,5),
3165              * we run the remainder of the pattern; on each try setting
3166              * the current position to the character following the word,
3167              * returning to try the next word on failure.
3168              *
3169              * We avoid having to build a list of words at runtime by
3170              * using a compile-time structure, wordinfo[].prev, which
3171              * gives, for each word, the previous accepting word (if any).
3172              * In the case above it would contain the mappings 1->2, 2->0,
3173              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3174              * the longest word (4 above), a list of all words, by
3175              * following the list of prev pointers; this gives us the
3176              * unordered list 4,5,1,2. Then given the current word we have
3177              * just tried, we can go through the list and find the
3178              * next-biggest word to try (so if we just failed on word 2,
3179              * the next in the list is 4).
3180              *
3181              * Since at runtime we don't record the matching position in
3182              * the string for each word, we have to work that out for
3183              * each word we're about to process. The wordinfo table holds
3184              * the character length of each word; given that we recorded
3185              * at the start: the position of the shortest word and its
3186              * length in chars, we just need to move the pointer the
3187              * difference between the two char lengths. Depending on
3188              * Unicode status and folding, that's cheap or expensive.
3189              *
3190              * This algorithm is optimised for the case where are only a
3191              * small number of accept states, i.e. 0,1, or maybe 2.
3192              * With lots of accepts states, and having to try all of them,
3193              * it becomes quadratic on number of accept states to find all
3194              * the next words.
3195              */
3196
3197             {
3198                 /* what type of TRIE am I? (utf8 makes this contextual) */
3199                 DECL_TRIE_TYPE(scan);
3200
3201                 /* what trie are we using right now */
3202                 reg_trie_data * const trie
3203                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3204                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3205                 U32 state = trie->startstate;
3206
3207                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3208                     !TRIE_BITMAP_TEST(trie,*locinput)
3209                 ) {
3210                     if (trie->states[ state ].wordnum) {
3211                          DEBUG_EXECUTE_r(
3212                             PerlIO_printf(Perl_debug_log,
3213                                           "%*s  %smatched empty string...%s\n",
3214                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3215                         );
3216                         break;
3217                     } else {
3218                         DEBUG_EXECUTE_r(
3219                             PerlIO_printf(Perl_debug_log,
3220                                           "%*s  %sfailed to match trie start class...%s\n",
3221                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3222                         );
3223                         sayNO_SILENT;
3224                    }
3225                 }
3226
3227             {
3228                 U8 *uc = ( U8* )locinput;
3229
3230                 STRLEN len = 0;
3231                 STRLEN foldlen = 0;
3232                 U8 *uscan = (U8*)NULL;
3233                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3234                 U32 charcount = 0; /* how many input chars we have matched */
3235                 U32 accepted = 0; /* have we seen any accepting states? */
3236
3237                 ST.B = next;
3238                 ST.jump = trie->jump;
3239                 ST.me = scan;
3240                 ST.firstpos = NULL;
3241                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3242                 ST.nextword = 0;
3243
3244                 /* fully traverse the TRIE; note the position of the
3245                    shortest accept state and the wordnum of the longest
3246                    accept state */
3247
3248                 while ( state && uc <= (U8*)PL_regeol ) {
3249                     U32 base = trie->states[ state ].trans.base;
3250                     UV uvc = 0;
3251                     U16 charid = 0;
3252                     U16 wordnum;
3253                     wordnum = trie->states[ state ].wordnum;
3254
3255                     if (wordnum) { /* it's an accept state */
3256                         if (!accepted) {
3257                             accepted = 1;
3258                             /* record first match position */
3259                             if (ST.longfold) {
3260                                 ST.firstpos = (U8*)locinput;
3261                                 ST.firstchars = 0;
3262                             }
3263                             else {
3264                                 ST.firstpos = uc;
3265                                 ST.firstchars = charcount;
3266                             }
3267                         }
3268                         if (!ST.nextword || wordnum < ST.nextword)
3269                             ST.nextword = wordnum;
3270                         ST.topword = wordnum;
3271                     }
3272
3273                     DEBUG_TRIE_EXECUTE_r({
3274                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3275                                 PerlIO_printf( Perl_debug_log,
3276                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3277                                     2+depth * 2, "", PL_colors[4],
3278                                     (UV)state, (accepted ? 'Y' : 'N'));
3279                     });
3280
3281                     /* read a char and goto next state */
3282                     if ( base ) {
3283                         I32 offset;
3284                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3285                                              uscan, len, uvc, charid, foldlen,
3286                                              foldbuf, uniflags);
3287                         charcount++;
3288                         if (foldlen>0)
3289                             ST.longfold = TRUE;
3290                         if (charid &&
3291                              ( ((offset =
3292                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3293
3294                              && ((U32)offset < trie->lasttrans)
3295                              && trie->trans[offset].check == state)
3296                         {
3297                             state = trie->trans[offset].next;
3298                         }
3299                         else {
3300                             state = 0;
3301                         }
3302                         uc += len;
3303
3304                     }
3305                     else {
3306                         state = 0;
3307                     }
3308                     DEBUG_TRIE_EXECUTE_r(
3309                         PerlIO_printf( Perl_debug_log,
3310                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3311                             charid, uvc, (UV)state, PL_colors[5] );
3312                     );
3313                 }
3314                 if (!accepted)
3315                    sayNO;
3316
3317                 /* calculate total number of accept states */
3318                 {
3319                     U16 w = ST.topword;
3320                     accepted = 0;
3321                     while (w) {
3322                         w = trie->wordinfo[w].prev;
3323                         accepted++;
3324                     }
3325                     ST.accepted = accepted;
3326                 }
3327
3328                 DEBUG_EXECUTE_r(
3329                     PerlIO_printf( Perl_debug_log,
3330                         "%*s  %sgot %"IVdf" possible matches%s\n",
3331                         REPORT_CODE_OFF + depth * 2, "",
3332                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3333                 );
3334                 goto trie_first_try; /* jump into the fail handler */
3335             }}
3336             /* NOTREACHED */
3337
3338         case TRIE_next_fail: /* we failed - try next alternative */
3339             if ( ST.jump) {
3340                 REGCP_UNWIND(ST.cp);
3341                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3342                     PL_regoffs[n].end = -1;
3343                 *PL_reglastparen = n;
3344             }
3345             if (!--ST.accepted) {
3346                 DEBUG_EXECUTE_r({
3347                     PerlIO_printf( Perl_debug_log,
3348                         "%*s  %sTRIE failed...%s\n",
3349                         REPORT_CODE_OFF+depth*2, "",
3350                         PL_colors[4],
3351                         PL_colors[5] );
3352                 });
3353                 sayNO_SILENT;
3354             }
3355             {
3356                 /* Find next-highest word to process.  Note that this code
3357                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3358                 register U16 min = 0;
3359                 register U16 word;
3360                 register U16 const nextword = ST.nextword;
3361                 register reg_trie_wordinfo * const wordinfo
3362                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3363                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3364                     if (word > nextword && (!min || word < min))
3365                         min = word;
3366                 }
3367                 ST.nextword = min;
3368             }
3369
3370           trie_first_try:
3371             if (do_cutgroup) {
3372                 do_cutgroup = 0;
3373                 no_final = 0;
3374             }
3375
3376             if ( ST.jump) {
3377                 ST.lastparen = *PL_reglastparen;
3378                 REGCP_SET(ST.cp);
3379             }
3380
3381             /* find start char of end of current word */
3382             {
3383                 U32 chars; /* how many chars to skip */
3384                 U8 *uc = ST.firstpos;
3385                 reg_trie_data * const trie
3386                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3387
3388                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3389                             >=  ST.firstchars);
3390                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3391                             - ST.firstchars;
3392
3393                 if (ST.longfold) {
3394                     /* the hard option - fold each char in turn and find
3395                      * its folded length (which may be different */
3396                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3397                     STRLEN foldlen;
3398                     STRLEN len;
3399                     UV uvc;
3400                     U8 *uscan;
3401
3402                     while (chars) {
3403                         if (utf8_target) {
3404                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3405                                                     uniflags);
3406                             uc += len;
3407                         }
3408                         else {
3409                             uvc = *uc;
3410                             uc++;
3411                         }
3412                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3413                         uscan = foldbuf;
3414                         while (foldlen) {
3415                             if (!--chars)
3416                                 break;
3417                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3418                                             uniflags);
3419                             uscan += len;
3420                             foldlen -= len;
3421                         }
3422                     }
3423                 }
3424                 else {
3425                     if (utf8_target)
3426                         while (chars--)
3427                             uc += UTF8SKIP(uc);
3428                     else
3429                         uc += chars;
3430                 }
3431                 PL_reginput = (char *)uc;
3432             }
3433
3434             scan = (ST.jump && ST.jump[ST.nextword])
3435                         ? ST.me + ST.jump[ST.nextword]
3436                         : ST.B;
3437
3438             DEBUG_EXECUTE_r({
3439                 PerlIO_printf( Perl_debug_log,
3440                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3441                     REPORT_CODE_OFF+depth*2, "",
3442                     PL_colors[4],
3443                     ST.nextword,
3444                     PL_colors[5]
3445                     );
3446             });
3447
3448             if (ST.accepted > 1 || has_cutgroup) {
3449                 PUSH_STATE_GOTO(TRIE_next, scan);
3450                 /* NOTREACHED */
3451             }
3452             /* only one choice left - just continue */
3453             DEBUG_EXECUTE_r({
3454                 AV *const trie_words
3455                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3456                 SV ** const tmp = av_fetch( trie_words,
3457                     ST.nextword-1, 0 );
3458                 SV *sv= tmp ? sv_newmortal() : NULL;
3459
3460                 PerlIO_printf( Perl_debug_log,
3461                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3462                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3463                     ST.nextword,
3464                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3465                             PL_colors[0], PL_colors[1],
3466                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3467                         )
3468                     : "not compiled under -Dr",
3469                     PL_colors[5] );
3470             });
3471
3472             locinput = PL_reginput;
3473             nextchr = UCHARAT(locinput);
3474             continue; /* execute rest of RE */
3475             /* NOTREACHED */
3476 #undef  ST
3477
3478         case EXACT: {
3479             char *s = STRING(scan);
3480             ln = STR_LEN(scan);
3481             if (utf8_target != UTF_PATTERN) {
3482                 /* The target and the pattern have differing utf8ness. */
3483                 char *l = locinput;
3484                 const char * const e = s + ln;
3485
3486                 if (utf8_target) {
3487                     /* The target is utf8, the pattern is not utf8. */
3488                     while (s < e) {
3489                         STRLEN ulen;
3490                         if (l >= PL_regeol)
3491                              sayNO;
3492                         if (NATIVE_TO_UNI(*(U8*)s) !=
3493                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3494                                             uniflags))
3495                              sayNO;
3496                         l += ulen;
3497                         s ++;
3498                     }
3499                 }
3500                 else {
3501                     /* The target is not utf8, the pattern is utf8. */
3502                     while (s < e) {
3503                         STRLEN ulen;
3504                         if (l >= PL_regeol)
3505                             sayNO;
3506                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3507                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3508                                            uniflags))
3509                             sayNO;
3510                         s += ulen;
3511                         l ++;
3512                     }
3513                 }
3514                 locinput = l;
3515                 nextchr = UCHARAT(locinput);
3516                 break;
3517             }
3518             /* The target and the pattern have the same utf8ness. */
3519             /* Inline the first character, for speed. */
3520             if (UCHARAT(s) != nextchr)
3521                 sayNO;
3522             if (PL_regeol - locinput < ln)
3523                 sayNO;
3524             if (ln > 1 && memNE(s, locinput, ln))
3525                 sayNO;
3526             locinput += ln;
3527             nextchr = UCHARAT(locinput);
3528             break;
3529             }
3530         case EXACTFL:
3531             PL_reg_flags |= RF_tainted;
3532             /* FALL THROUGH */
3533         case EXACTF: {
3534             char * const s = STRING(scan);
3535             ln = STR_LEN(scan);
3536
3537             if (utf8_target || UTF_PATTERN) {
3538               /* Either target or the pattern are utf8. */
3539                 const char * const l = locinput;
3540                 char *e = PL_regeol;
3541
3542                 if (! foldEQ_utf8(s, 0,  ln, cBOOL(UTF_PATTERN),
3543                                l, &e, 0,  utf8_target)) {
3544                      /* One more case for the sharp s:
3545                       * pack("U0U*", 0xDF) =~ /ss/i,
3546                       * the 0xC3 0x9F are the UTF-8
3547                       * byte sequence for the U+00DF. */
3548
3549                      if (!(utf8_target &&
3550                            toLOWER(s[0]) == 's' &&
3551                            ln >= 2 &&
3552                            toLOWER(s[1]) == 's' &&
3553                            (U8)l[0] == 0xC3 &&
3554                            e - l >= 2 &&
3555                            (U8)l[1] == 0x9F))
3556                           sayNO;
3557                 }
3558                 locinput = e;
3559                 nextchr = UCHARAT(locinput);
3560                 break;
3561             }
3562
3563             /* Neither the target and the pattern are utf8. */
3564
3565             /* Inline the first character, for speed. */
3566             if (UCHARAT(s) != nextchr &&
3567                 UCHARAT(s) != ((OP(scan) == EXACTF)
3568                                ? PL_fold : PL_fold_locale)[nextchr])
3569                 sayNO;
3570             if (PL_regeol - locinput < ln)
3571                 sayNO;
3572             if (ln > 1 && (OP(scan) == EXACTF
3573                            ? ! foldEQ(s, locinput, ln)
3574                            : ! foldEQ_locale(s, locinput, ln)))
3575                 sayNO;
3576             locinput += ln;
3577             nextchr = UCHARAT(locinput);
3578             break;
3579             }
3580         case BOUNDL:
3581         case NBOUNDL:
3582             PL_reg_flags |= RF_tainted;
3583             /* FALL THROUGH */
3584         case BOUND:
3585         case NBOUND:
3586             /* was last char in word? */
3587             if (utf8_target) {
3588                 if (locinput == PL_bostr)
3589                     ln = '\n';
3590                 else {
3591                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3592
3593                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3594                 }
3595                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3596                     ln = isALNUM_uni(ln);
3597                     LOAD_UTF8_CHARCLASS_ALNUM();
3598                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3599                 }
3600                 else {
3601                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3602                     n = isALNUM_LC_utf8((U8*)locinput);
3603                 }
3604             }
3605             else {
3606                 ln = (locinput != PL_bostr) ?
3607                     UCHARAT(locinput - 1) : '\n';
3608                 if (FLAGS(scan) & USE_UNI) {
3609
3610                     /* Here, can't be BOUNDL or NBOUNDL because they never set
3611                      * the flags to USE_UNI */
3612                     ln = isWORDCHAR_L1(ln);
3613                     n = isWORDCHAR_L1(nextchr);
3614                 }
3615                 else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3616                     ln = isALNUM(ln);
3617                     n = isALNUM(nextchr);
3618                 }
3619                 else {
3620                     ln = isALNUM_LC(ln);
3621                     n = isALNUM_LC(nextchr);
3622                 }
3623             }
3624             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3625                                     OP(scan) == BOUNDL))
3626                     sayNO;
3627             break;
3628         case ANYOF:
3629             if (utf8_target) {
3630                 STRLEN inclasslen = PL_regeol - locinput;
3631
3632                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3633                     goto anyof_fail;
3634                 if (locinput >= PL_regeol)
3635                     sayNO;
3636                 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3637                 nextchr = UCHARAT(locinput);
3638                 break;
3639             }
3640             else {
3641                 if (nextchr < 0)
3642                     nextchr = UCHARAT(locinput);
3643                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3644                     goto anyof_fail;
3645                 if (!nextchr && locinput >= PL_regeol)
3646                     sayNO;
3647                 nextchr = UCHARAT(++locinput);
3648                 break;
3649             }
3650         anyof_fail:
3651             /* If we might have the case of the German sharp s
3652              * in a casefolding Unicode character class. */
3653
3654             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3655                  locinput += SHARP_S_SKIP;
3656                  nextchr = UCHARAT(locinput);
3657             }
3658             else
3659                  sayNO;
3660             break;
3661         /* Special char classes - The defines start on line 129 or so */
3662         CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3663         CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3664
3665         CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3666         CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3667
3668         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3669         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3670
3671         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3672                        a Unicode extended Grapheme Cluster */
3673             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3674               extended Grapheme Cluster is:
3675
3676                CR LF
3677                | Prepend* Begin Extend*
3678                | .
3679
3680                Begin is (Hangul-syllable | ! Control)
3681                Extend is (Grapheme_Extend | Spacing_Mark)
3682                Control is [ GCB_Control CR LF ]
3683
3684                The discussion below shows how the code for CLUMP is derived
3685                from this regex.  Note that most of these concepts are from
3686                property values of the Grapheme Cluster Boundary (GCB) property.
3687                No code point can have multiple property values for a given
3688                property.  Thus a code point in Prepend can't be in Control, but
3689                it must be in !Control.  This is why Control above includes
3690                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3691                property separately, and so can't be in GCB_Control, even though
3692                they logically are controls.  Control is not the same as gc=cc,
3693                but includes format and other characters as well.
3694
3695                The Unicode definition of Hangul-syllable is:
3696                    L+
3697                    | (L* ( ( V | LV ) V* | LVT ) T*)
3698                    | T+
3699                   )
3700                Each of these is a value for the GCB property, and hence must be
3701                disjoint, so the order they are tested is immaterial, so the
3702                above can safely be changed to
3703                    T+
3704                    | L+
3705                    | (L* ( LVT | ( V | LV ) V*) T*)
3706
3707                The last two terms can be combined like this:
3708                    L* ( L
3709                         | (( LVT | ( V | LV ) V*) T*))
3710
3711                And refactored into this:
3712                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3713
3714                That means that if we have seen any L's at all we can quit
3715                there, but if the next character is a LVT, a V or and LV we
3716                should keep going.
3717
3718                There is a subtlety with Prepend* which showed up in testing.
3719                Note that the Begin, and only the Begin is required in:
3720                 | Prepend* Begin Extend*
3721                Also, Begin contains '! Control'.  A Prepend must be a '!
3722                Control', which means it must be a Begin.  What it comes down to
3723                is that if we match Prepend* and then find no suitable Begin
3724                afterwards, that if we backtrack the last Prepend, that one will
3725                be a suitable Begin.
3726             */
3727
3728             if (locinput >= PL_regeol)
3729                 sayNO;
3730             if  (! utf8_target) {
3731
3732                 /* Match either CR LF  or '.', as all the other possibilities
3733                  * require utf8 */
3734                 locinput++;         /* Match the . or CR */
3735                 if (nextchr == '\r'
3736                     && locinput < PL_regeol
3737                     && UCHARAT(locinput) == '\n') locinput++;
3738             }
3739             else {
3740
3741                 /* Utf8: See if is ( CR LF ); already know that locinput <
3742                  * PL_regeol, so locinput+1 is in bounds */
3743                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3744                     locinput += 2;
3745                 }
3746                 else {
3747                     /* In case have to backtrack to beginning, then match '.' */
3748                     char *starting = locinput;
3749
3750                     /* In case have to backtrack the last prepend */
3751                     char *previous_prepend = 0;
3752
3753                     LOAD_UTF8_CHARCLASS_GCB();
3754
3755                     /* Match (prepend)* */
3756                     while (locinput < PL_regeol
3757                            && swash_fetch(PL_utf8_X_prepend,
3758                                           (U8*)locinput, utf8_target))
3759                     {
3760                         previous_prepend = locinput;
3761                         locinput += UTF8SKIP(locinput);
3762                     }
3763
3764                     /* As noted above, if we matched a prepend character, but
3765                      * the next thing won't match, back off the last prepend we
3766                      * matched, as it is guaranteed to match the begin */
3767                     if (previous_prepend
3768                         && (locinput >=  PL_regeol
3769                             || ! swash_fetch(PL_utf8_X_begin,
3770                                              (U8*)locinput, utf8_target)))
3771                     {
3772                         locinput = previous_prepend;
3773                     }
3774
3775                     /* Note that here we know PL_regeol > locinput, as we
3776                      * tested that upon input to this switch case, and if we
3777                      * moved locinput forward, we tested the result just above
3778                      * and it either passed, or we backed off so that it will
3779                      * now pass */
3780                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3781
3782                         /* Here did not match the required 'Begin' in the
3783                          * second term.  So just match the very first
3784                          * character, the '.' of the final term of the regex */
3785                         locinput = starting + UTF8SKIP(starting);
3786                     } else {
3787
3788                         /* Here is the beginning of a character that can have
3789                          * an extender.  It is either a hangul syllable, or a
3790                          * non-control */
3791                         if (swash_fetch(PL_utf8_X_non_hangul,
3792                                         (U8*)locinput, utf8_target))
3793                         {
3794
3795                             /* Here not a Hangul syllable, must be a
3796                              * ('!  * Control') */
3797                             locinput += UTF8SKIP(locinput);
3798                         } else {
3799
3800                             /* Here is a Hangul syllable.  It can be composed
3801                              * of several individual characters.  One
3802                              * possibility is T+ */
3803                             if (swash_fetch(PL_utf8_X_T,
3804                                             (U8*)locinput, utf8_target))
3805                             {
3806                                 while (locinput < PL_regeol
3807                                         && swash_fetch(PL_utf8_X_T,
3808                                                         (U8*)locinput, utf8_target))
3809                                 {
3810                                     locinput += UTF8SKIP(locinput);
3811                                 }
3812                             } else {
3813
3814                                 /* Here, not T+, but is a Hangul.  That means
3815                                  * it is one of the others: L, LV, LVT or V,
3816                                  * and matches:
3817                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3818
3819                                 /* Match L*           */
3820                                 while (locinput < PL_regeol
3821                                         && swash_fetch(PL_utf8_X_L,
3822                                                         (U8*)locinput, utf8_target))
3823                                 {
3824                                     locinput += UTF8SKIP(locinput);
3825                                 }
3826
3827                                 /* Here, have exhausted L*.  If the next
3828                                  * character is not an LV, LVT nor V, it means
3829                                  * we had to have at least one L, so matches L+
3830                                  * in the original equation, we have a complete
3831                                  * hangul syllable.  Are done. */
3832
3833                                 if (locinput < PL_regeol
3834                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3835                                                     (U8*)locinput, utf8_target))
3836                                 {
3837
3838                                     /* Otherwise keep going.  Must be LV, LVT
3839                                      * or V.  See if LVT */
3840                                     if (swash_fetch(PL_utf8_X_LVT,
3841                                                     (U8*)locinput, utf8_target))
3842                                     {
3843                                         locinput += UTF8SKIP(locinput);
3844                                     } else {
3845
3846                                         /* Must be  V or LV.  Take it, then
3847                                          * match V*     */
3848                                         locinput += UTF8SKIP(locinput);
3849                                         while (locinput < PL_regeol
3850                                                 && swash_fetch(PL_utf8_X_V,
3851                                                          (U8*)locinput, utf8_target))
3852                                         {
3853                                             locinput += UTF8SKIP(locinput);
3854                                         }
3855                                     }
3856
3857                                     /* And any of LV, LVT, or V can be followed
3858                                      * by T*            */
3859                                     while (locinput < PL_regeol
3860                                            && swash_fetch(PL_utf8_X_T,
3861                                                            (U8*)locinput,
3862                                                            utf8_target))
3863                                     {
3864                                         locinput += UTF8SKIP(locinput);
3865                                     }
3866                                 }
3867                             }
3868                         }
3869
3870                         /* Match any extender */
3871                         while (locinput < PL_regeol
3872                                 && swash_fetch(PL_utf8_X_extend,
3873                                                 (U8*)locinput, utf8_target))
3874                         {
3875                             locinput += UTF8SKIP(locinput);
3876                         }
3877                     }
3878                 }
3879                 if (locinput > PL_regeol) sayNO;
3880             }
3881             nextchr = UCHARAT(locinput);
3882             break;
3883
3884         case NREFFL:
3885         {
3886             char *s;
3887             char type;
3888             PL_reg_flags |= RF_tainted;
3889             /* FALL THROUGH */
3890         case NREF:
3891         case NREFF:
3892             type = OP(scan);
3893             n = reg_check_named_buff_matched(rex,scan);
3894
3895             if ( n ) {
3896                 type = REF + ( type - NREF );
3897                 goto do_ref;
3898             } else {
3899                 sayNO;
3900             }
3901             /* unreached */
3902         case REFFL:
3903             PL_reg_flags |= RF_tainted;
3904             /* FALL THROUGH */
3905         case REF:
3906         case REFF:
3907             n = ARG(scan);  /* which paren pair */
3908             type = OP(scan);
3909           do_ref:
3910             ln = PL_regoffs[n].start;
3911             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
3912             if (*PL_reglastparen < n || ln == -1)
3913                 sayNO;                  /* Do not match unless seen CLOSEn. */
3914             if (ln == PL_regoffs[n].end)
3915                 break;
3916
3917             s = PL_bostr + ln;
3918             if (utf8_target && type != REF) {   /* REF can do byte comparison */
3919                 char *l = locinput;
3920                 const char *e = PL_bostr + PL_regoffs[n].end;
3921                 /*
3922                  * Note that we can't do the "other character" lookup trick as
3923                  * in the 8-bit case (no pun intended) because in Unicode we
3924                  * have to map both upper and title case to lower case.
3925                  */
3926                 if (type == REFF) {
3927                     while (s < e) {
3928                         STRLEN ulen1, ulen2;
3929                         U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3930                         U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3931
3932                         if (l >= PL_regeol)
3933                             sayNO;
3934                         toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3935                         toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3936                         if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3937                             sayNO;
3938                         s += ulen1;
3939                         l += ulen2;
3940                     }
3941                 }
3942                 locinput = l;
3943                 nextchr = UCHARAT(locinput);
3944                 break;
3945             }
3946
3947             /* Inline the first character, for speed. */
3948             if (UCHARAT(s) != nextchr &&
3949                 (type == REF ||
3950                  (UCHARAT(s) != (type == REFF
3951                                   ? PL_fold : PL_fold_locale)[nextchr])))
3952                 sayNO;
3953             ln = PL_regoffs[n].end - ln;
3954             if (locinput + ln > PL_regeol)
3955                 sayNO;
3956             if (ln > 1 && (type == REF
3957                            ? memNE(s, locinput, ln)
3958                            : (type == REFF
3959                               ? ! foldEQ(s, locinput, ln)
3960                               : ! foldEQ_locale(s, locinput, ln))))
3961                 sayNO;
3962             locinput += ln;
3963             nextchr = UCHARAT(locinput);
3964             break;
3965         }
3966         case NOTHING:
3967         case TAIL:
3968             break;
3969         case BACK:
3970             break;
3971
3972 #undef  ST
3973 #define ST st->u.eval
3974         {
3975             SV *ret;
3976             REGEXP *re_sv;
3977             regexp *re;
3978             regexp_internal *rei;
3979             regnode *startpoint;
3980
3981         case GOSTART:
3982         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3983             if (cur_eval && cur_eval->locinput==locinput) {
3984                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3985                     Perl_croak(aTHX_ "Infinite recursion in regex");
3986                 if ( ++nochange_depth > max_nochange_depth )
3987                     Perl_croak(aTHX_
3988                         "Pattern subroutine nesting without pos change"
3989                         " exceeded limit in regex");
3990             } else {
3991                 nochange_depth = 0;
3992             }
3993             re_sv = rex_sv;
3994             re = rex;
3995             rei = rexi;
3996             (void)ReREFCNT_inc(rex_sv);
3997             if (OP(scan)==GOSUB) {
3998                 startpoint = scan + ARG2L(scan);
3999                 ST.close_paren = ARG(scan);
4000             } else {
4001                 startpoint = rei->program+1;
4002                 ST.close_paren = 0;
4003             }
4004             goto eval_recurse_doit;
4005             /* NOTREACHED */
4006         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4007             if (cur_eval && cur_eval->locinput==locinput) {
4008                 if ( ++nochange_depth > max_nochange_depth )
4009                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4010             } else {
4011                 nochange_depth = 0;
4012             }
4013             {
4014                 /* execute the code in the {...} */
4015                 dSP;
4016                 SV ** const before = SP;
4017                 OP_4tree * const oop = PL_op;
4018                 COP * const ocurcop = PL_curcop;
4019                 PAD *old_comppad;
4020                 char *saved_regeol = PL_regeol;
4021                 struct re_save_state saved_state;
4022
4023                 /* To not corrupt the existing regex state while executing the
4024                  * eval we would normally put it on the save stack, like with
4025                  * save_re_context. However, re-evals have a weird scoping so we
4026                  * can't just add ENTER/LEAVE here. With that, things like
4027                  *
4028                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4029                  *
4030                  * would break, as they expect the localisation to be unwound
4031                  * only when the re-engine backtracks through the bit that
4032                  * localised it.
4033                  *
4034                  * What we do instead is just saving the state in a local c
4035                  * variable.
4036                  */
4037                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4038
4039                 n = ARG(scan);
4040                 PL_op = (OP_4tree*)rexi->data->data[n];
4041                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4042                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4043                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4044                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4045
4046                 if (sv_yes_mark) {
4047                     SV *sv_mrk = get_sv("REGMARK", 1);
4048                     sv_setsv(sv_mrk, sv_yes_mark);
4049                 }
4050
4051                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4052                 SPAGAIN;
4053                 if (SP == before)
4054                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4055                 else {
4056                     ret = POPs;
4057                     PUTBACK;
4058                 }
4059
4060                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4061
4062                 PL_op = oop;
4063                 PAD_RESTORE_LOCAL(old_comppad);
4064                 PL_curcop = ocurcop;
4065                 PL_regeol = saved_regeol;
4066                 if (!logical) {
4067                     /* /(?{...})/ */
4068                     sv_setsv(save_scalar(PL_replgv), ret);
4069                     break;
4070                 }
4071             }
4072             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4073                 logical = 0;
4074                 {
4075                     /* extract RE object from returned value; compiling if
4076                      * necessary */
4077                     MAGIC *mg = NULL;
4078                     REGEXP *rx = NULL;
4079
4080                     if (SvROK(ret)) {
4081                         SV *const sv = SvRV(ret);
4082
4083                         if (SvTYPE(sv) == SVt_REGEXP) {
4084                             rx = (REGEXP*) sv;
4085                         } else if (SvSMAGICAL(sv)) {
4086                             mg = mg_find(sv, PERL_MAGIC_qr);
4087                             assert(mg);
4088                         }
4089                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4090                         rx = (REGEXP*) ret;
4091                     } else if (SvSMAGICAL(ret)) {
4092                         if (SvGMAGICAL(ret)) {
4093                             /* I don't believe that there is ever qr magic
4094                                here.  */
4095                             assert(!mg_find(ret, PERL_MAGIC_qr));
4096                             sv_unmagic(ret, PERL_MAGIC_qr);
4097                         }
4098                         else {
4099                             mg = mg_find(ret, PERL_MAGIC_qr);
4100                             /* testing suggests mg only ends up non-NULL for
4101                                scalars who were upgraded and compiled in the
4102                                else block below. In turn, this is only
4103                                triggered in the "postponed utf8 string" tests
4104                                in t/op/pat.t  */
4105                         }
4106                     }
4107
4108                     if (mg) {
4109                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4110                         assert(rx);
4111                     }
4112                     if (rx) {
4113                         rx = reg_temp_copy(NULL, rx);
4114                     }
4115                     else {
4116                         U32 pm_flags = 0;
4117                         const I32 osize = PL_regsize;
4118
4119                         if (DO_UTF8(ret)) {
4120                             assert (SvUTF8(ret));
4121                         } else if (SvUTF8(ret)) {
4122                             /* Not doing UTF-8, despite what the SV says. Is
4123                                this only if we're trapped in use 'bytes'?  */
4124                             /* Make a copy of the octet sequence, but without
4125                                the flag on, as the compiler now honours the
4126                                SvUTF8 flag on ret.  */
4127                             STRLEN len;
4128                             const char *const p = SvPV(ret, len);
4129                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4130                         }
4131                         rx = CALLREGCOMP(ret, pm_flags);
4132                         if (!(SvFLAGS(ret)
4133                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4134                                  | SVs_GMG))) {
4135                             /* This isn't a first class regexp. Instead, it's
4136                                caching a regexp onto an existing, Perl visible
4137                                scalar.  */
4138                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4139                         }
4140                         PL_regsize = osize;
4141                     }
4142                     re_sv = rx;
4143                     re = (struct regexp *)SvANY(rx);
4144                 }
4145                 RXp_MATCH_COPIED_off(re);
4146                 re->subbeg = rex->subbeg;
4147                 re->sublen = rex->sublen;
4148                 rei = RXi_GET(re);
4149                 DEBUG_EXECUTE_r(
4150                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4151                         "Matching embedded");
4152                 );
4153                 startpoint = rei->program + 1;
4154                 ST.close_paren = 0; /* only used for GOSUB */
4155                 /* borrowed from regtry */
4156                 if (PL_reg_start_tmpl <= re->nparens) {
4157                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4158                     if(PL_reg_start_tmp)
4159                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4160                     else
4161                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4162                 }
4163
4164         eval_recurse_doit: /* Share code with GOSUB below this line */
4165                 /* run the pattern returned from (??{...}) */
4166                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4167                 REGCP_SET(ST.lastcp);
4168
4169                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4170
4171                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4172                 PL_reglastparen = &re->lastparen;
4173                 PL_reglastcloseparen = &re->lastcloseparen;
4174                 re->lastparen = 0;
4175                 re->lastcloseparen = 0;
4176
4177                 PL_reginput = locinput;
4178                 PL_regsize = 0;
4179
4180                 /* XXXX This is too dramatic a measure... */
4181                 PL_reg_maxiter = 0;
4182
4183                 ST.toggle_reg_flags = PL_reg_flags;
4184                 if (RX_UTF8(re_sv))
4185                     PL_reg_flags |= RF_utf8;
4186                 else
4187                     PL_reg_flags &= ~RF_utf8;
4188                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4189
4190                 ST.prev_rex = rex_sv;
4191                 ST.prev_curlyx = cur_curlyx;
4192                 SETREX(rex_sv,re_sv);
4193                 rex = re;
4194                 rexi = rei;
4195                 cur_curlyx = NULL;
4196                 ST.B = next;
4197                 ST.prev_eval = cur_eval;
4198                 cur_eval = st;
4199                 /* now continue from first node in postoned RE */
4200                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4201                 /* NOTREACHED */
4202             }
4203             /* logical is 1,   /(?(?{...})X|Y)/ */
4204             sw = cBOOL(SvTRUE(ret));
4205             logical = 0;
4206             break;
4207         }
4208
4209         case EVAL_AB: /* cleanup after a successful (??{A})B */
4210             /* note: this is called twice; first after popping B, then A */
4211             PL_reg_flags ^= ST.toggle_reg_flags;
4212             ReREFCNT_dec(rex_sv);
4213             SETREX(rex_sv,ST.prev_rex);
4214             rex = (struct regexp *)SvANY(rex_sv);
4215             rexi = RXi_GET(rex);
4216             regcpblow(ST.cp);
4217             cur_eval = ST.prev_eval;
4218             cur_curlyx = ST.prev_curlyx;
4219
4220             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4221             PL_reglastparen = &rex->lastparen;
4222             PL_reglastcloseparen = &rex->lastcloseparen;
4223             /* also update PL_regoffs */
4224             PL_regoffs = rex->offs;
4225
4226             /* XXXX This is too dramatic a measure... */
4227             PL_reg_maxiter = 0;
4228             if ( nochange_depth )
4229                 nochange_depth--;
4230             sayYES;
4231
4232
4233         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4234             /* note: this is called twice; first after popping B, then A */
4235             PL_reg_flags ^= ST.toggle_reg_flags;
4236             ReREFCNT_dec(rex_sv);
4237             SETREX(rex_sv,ST.prev_rex);
4238             rex = (struct regexp *)SvANY(rex_sv);
4239             rexi = RXi_GET(rex);
4240             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4241             PL_reglastparen = &rex->lastparen;
4242             PL_reglastcloseparen = &rex->lastcloseparen;
4243
4244             PL_reginput = locinput;
4245             REGCP_UNWIND(ST.lastcp);
4246             regcppop(rex);
4247             cur_eval = ST.prev_eval;
4248             cur_curlyx = ST.prev_curlyx;
4249             /* XXXX This is too dramatic a measure... */
4250             PL_reg_maxiter = 0;
4251             if ( nochange_depth )
4252                 nochange_depth--;
4253             sayNO_SILENT;
4254 #undef ST
4255
4256         case OPEN:
4257             n = ARG(scan);  /* which paren pair */
4258             PL_reg_start_tmp[n] = locinput;
4259             if (n > PL_regsize)
4260                 PL_regsize = n;
4261             lastopen = n;
4262             break;
4263         case CLOSE:
4264             n = ARG(scan);  /* which paren pair */
4265             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4266             PL_regoffs[n].end = locinput - PL_bostr;
4267             /*if (n > PL_regsize)
4268                 PL_regsize = n;*/
4269             if (n > *PL_reglastparen)
4270                 *PL_reglastparen = n;
4271             *PL_reglastcloseparen = n;
4272             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4273                 goto fake_end;
4274             }
4275             break;
4276         case ACCEPT:
4277             if (ARG(scan)){
4278                 regnode *cursor;
4279                 for (cursor=scan;
4280                      cursor && OP(cursor)!=END;
4281                      cursor=regnext(cursor))
4282                 {
4283                     if ( OP(cursor)==CLOSE ){
4284                         n = ARG(cursor);
4285                         if ( n <= lastopen ) {
4286                             PL_regoffs[n].start
4287                                 = PL_reg_start_tmp[n] - PL_bostr;
4288                             PL_regoffs[n].end = locinput - PL_bostr;
4289                             /*if (n > PL_regsize)
4290                             PL_regsize = n;*/
4291                             if (n > *PL_reglastparen)
4292                                 *PL_reglastparen = n;
4293                             *PL_reglastcloseparen = n;
4294                             if ( n == ARG(scan) || (cur_eval &&
4295                                 cur_eval->u.eval.close_paren == n))
4296                                 break;
4297                         }
4298                     }
4299                 }
4300             }
4301             goto fake_end;
4302             /*NOTREACHED*/
4303         case GROUPP:
4304             n = ARG(scan);  /* which paren pair */
4305             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4306             break;
4307         case NGROUPP:
4308             /* reg_check_named_buff_matched returns 0 for no match */
4309             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4310             break;
4311         case INSUBP:
4312             n = ARG(scan);
4313             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4314             break;
4315         case DEFINEP:
4316             sw = 0;
4317             break;
4318         case IFTHEN:
4319             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4320             if (sw)
4321                 next = NEXTOPER(NEXTOPER(scan));
4322             else {
4323                 next = scan + ARG(scan);
4324                 if (OP(next) == IFTHEN) /* Fake one. */
4325                     next = NEXTOPER(NEXTOPER(next));
4326             }
4327             break;
4328         case LOGICAL:
4329             logical = scan->flags;
4330             break;
4331
4332 /*******************************************************************
4333
4334 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4335 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4336 STAR/PLUS/CURLY/CURLYN are used instead.)
4337
4338 A*B is compiled as <CURLYX><A><WHILEM><B>
4339
4340 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4341 state, which contains the current count, initialised to -1. It also sets
4342 cur_curlyx to point to this state, with any previous value saved in the
4343 state block.
4344
4345 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4346 since the pattern may possibly match zero times (i.e. it's a while {} loop
4347 rather than a do {} while loop).
4348
4349 Each entry to WHILEM represents a successful match of A. The count in the
4350 CURLYX block is incremented, another WHILEM state is pushed, and execution
4351 passes to A or B depending on greediness and the current count.
4352
4353 For example, if matching against the string a1a2a3b (where the aN are
4354 substrings that match /A/), then the match progresses as follows: (the
4355 pushed states are interspersed with the bits of strings matched so far):
4356
4357     <CURLYX cnt=-1>
4358     <CURLYX cnt=0><WHILEM>
4359     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4360     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4361     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4362     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4363
4364 (Contrast this with something like CURLYM, which maintains only a single
4365 backtrack state:
4366
4367     <CURLYM cnt=0> a1
4368     a1 <CURLYM cnt=1> a2
4369     a1 a2 <CURLYM cnt=2> a3
4370     a1 a2 a3 <CURLYM cnt=3> b
4371 )
4372
4373 Each WHILEM state block marks a point to backtrack to upon partial failure
4374 of A or B, and also contains some minor state data related to that
4375 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4376 overall state, such as the count, and pointers to the A and B ops.
4377
4378 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4379 must always point to the *current* CURLYX block, the rules are:
4380
4381 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4382 and set cur_curlyx to point the new block.
4383
4384 When popping the CURLYX block after a successful or unsuccessful match,
4385 restore the previous cur_curlyx.
4386
4387 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4388 to the outer one saved in the CURLYX block.
4389
4390 When popping the WHILEM block after a successful or unsuccessful B match,
4391 restore the previous cur_curlyx.
4392
4393 Here's an example for the pattern (AI* BI)*BO
4394 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4395
4396 cur_
4397 curlyx backtrack stack
4398 ------ ---------------
4399 NULL
4400 CO     <CO prev=NULL> <WO>
4401 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4402 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4403 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4404
4405 At this point the pattern succeeds, and we work back down the stack to
4406 clean up, restoring as we go:
4407
4408 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4409 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4410 CO     <CO prev=NULL> <WO>
4411 NULL
4412
4413 *******************************************************************/
4414
4415 #define ST st->u.curlyx
4416
4417         case CURLYX:    /* start of /A*B/  (for complex A) */
4418         {
4419             /* No need to save/restore up to this paren */
4420             I32 parenfloor = scan->flags;
4421
4422             assert(next); /* keep Coverity happy */
4423             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4424                 next += ARG(next);
4425
4426             /* XXXX Probably it is better to teach regpush to support
4427                parenfloor > PL_regsize... */
4428             if (parenfloor > (I32)*PL_reglastparen)
4429                 parenfloor = *PL_reglastparen; /* Pessimization... */
4430
4431             ST.prev_curlyx= cur_curlyx;
4432             cur_curlyx = st;
4433             ST.cp = PL_savestack_ix;
4434
4435             /* these fields contain the state of the current curly.
4436              * they are accessed by subsequent WHILEMs */
4437             ST.parenfloor = parenfloor;
4438             ST.me = scan;
4439             ST.B = next;
4440             ST.minmod = minmod;
4441             minmod = 0;
4442             ST.count = -1;      /* this will be updated by WHILEM */
4443             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4444
4445             PL_reginput = locinput;
4446             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4447             /* NOTREACHED */
4448         }
4449
4450         case CURLYX_end: /* just finished matching all of A*B */
4451             cur_curlyx = ST.prev_curlyx;
4452             sayYES;
4453             /* NOTREACHED */
4454
4455         case CURLYX_end_fail: /* just failed to match all of A*B */
4456             regcpblow(ST.cp);
4457             cur_curlyx = ST.prev_curlyx;
4458             sayNO;
4459             /* NOTREACHED */
4460
4461
4462 #undef ST
4463 #define ST st->u.whilem
4464
4465         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4466         {
4467             /* see the discussion above about CURLYX/WHILEM */
4468             I32 n;
4469             int min = ARG1(cur_curlyx->u.curlyx.me);
4470             int max = ARG2(cur_curlyx->u.curlyx.me);
4471             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4472
4473             assert(cur_curlyx); /* keep Coverity happy */
4474             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4475             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4476             ST.cache_offset = 0;
4477             ST.cache_mask = 0;
4478
4479             PL_reginput = locinput;
4480
4481             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4482                   "%*s  whilem: matched %ld out of %d..%d\n",
4483                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4484             );
4485
4486             /* First just match a string of min A's. */
4487
4488             if (n < min) {
4489                 cur_curlyx->u.curlyx.lastloc = locinput;
4490                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4491                 /* NOTREACHED */
4492             }
4493
4494             /* If degenerate A matches "", assume A done. */
4495
4496             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4497                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4498                    "%*s  whilem: empty match detected, trying continuation...\n",
4499                    REPORT_CODE_OFF+depth*2, "")
4500                 );
4501                 goto do_whilem_B_max;
4502             }
4503
4504             /* super-linear cache processing */
4505
4506             if (scan->flags) {
4507
4508                 if (!PL_reg_maxiter) {
4509                     /* start the countdown: Postpone detection until we
4510                      * know the match is not *that* much linear. */
4511                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4512                     /* possible overflow for long strings and many CURLYX's */
4513                     if (PL_reg_maxiter < 0)
4514                         PL_reg_maxiter = I32_MAX;
4515                     PL_reg_leftiter = PL_reg_maxiter;
4516                 }
4517
4518                 if (PL_reg_leftiter-- == 0) {
4519                     /* initialise cache */
4520                     const I32 size = (PL_reg_maxiter + 7)/8;
4521                     if (PL_reg_poscache) {
4522                         if ((I32)PL_reg_poscache_size < size) {
4523                             Renew(PL_reg_poscache, size, char);
4524                             PL_reg_poscache_size = size;
4525                         }
4526                         Zero(PL_reg_poscache, size, char);
4527                     }
4528                     else {
4529                         PL_reg_poscache_size = size;
4530                         Newxz(PL_reg_poscache, size, char);
4531                     }
4532                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4533       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4534                               PL_colors[4], PL_colors[5])
4535                     );
4536                 }
4537
4538                 if (PL_reg_leftiter < 0) {
4539                     /* have we already failed at this position? */
4540                     I32 offset, mask;
4541                     offset  = (scan->flags & 0xf) - 1
4542                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4543                     mask    = 1 << (offset % 8);
4544                     offset /= 8;
4545                     if (PL_reg_poscache[offset] & mask) {
4546                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4547                             "%*s  whilem: (cache) already tried at this position...\n",
4548                             REPORT_CODE_OFF+depth*2, "")
4549                         );
4550                         sayNO; /* cache records failure */
4551                     }
4552                     ST.cache_offset = offset;
4553                     ST.cache_mask   = mask;
4554                 }
4555             }
4556
4557             /* Prefer B over A for minimal matching. */
4558
4559             if (cur_curlyx->u.curlyx.minmod) {
4560                 ST.save_curlyx = cur_curlyx;
4561                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4562                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4563                 REGCP_SET(ST.lastcp);
4564                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4565                 /* NOTREACHED */
4566             }
4567
4568             /* Prefer A over B for maximal matching. */
4569
4570             if (n < max) { /* More greed allowed? */
4571                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4572                 cur_curlyx->u.curlyx.lastloc = locinput;
4573                 REGCP_SET(ST.lastcp);
4574                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4575                 /* NOTREACHED */
4576             }
4577             goto do_whilem_B_max;
4578         }
4579         /* NOTREACHED */
4580
4581         case WHILEM_B_min: /* just matched B in a minimal match */
4582         case WHILEM_B_max: /* just matched B in a maximal match */
4583             cur_curlyx = ST.save_curlyx;
4584             sayYES;
4585             /* NOTREACHED */
4586
4587         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4588             cur_curlyx = ST.save_curlyx;
4589             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4590             cur_curlyx->u.curlyx.count--;
4591             CACHEsayNO;
4592             /* NOTREACHED */
4593
4594         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4595             REGCP_UNWIND(ST.lastcp);
4596             regcppop(rex);
4597             /* FALL THROUGH */
4598         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4599             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4600             cur_curlyx->u.curlyx.count--;
4601             CACHEsayNO;
4602             /* NOTREACHED */
4603
4604         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4605             REGCP_UNWIND(ST.lastcp);
4606             regcppop(rex);      /* Restore some previous $<digit>s? */
4607             PL_reginput = locinput;
4608             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4609                 "%*s  whilem: failed, trying continuation...\n",
4610                 REPORT_CODE_OFF+depth*2, "")
4611             );
4612           do_whilem_B_max:
4613             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4614                 && ckWARN(WARN_REGEXP)
4615                 && !(PL_reg_flags & RF_warned))
4616             {
4617                 PL_reg_flags |= RF_warned;
4618                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4619                      "Complex regular subexpression recursion",
4620                      REG_INFTY - 1);
4621             }
4622
4623             /* now try B */
4624             ST.save_curlyx = cur_curlyx;
4625             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4626             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4627             /* NOTREACHED */
4628
4629         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4630             cur_curlyx = ST.save_curlyx;
4631             REGCP_UNWIND(ST.lastcp);
4632             regcppop(rex);
4633
4634             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4635                 /* Maximum greed exceeded */
4636                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4637                     && ckWARN(WARN_REGEXP)
4638                     && !(PL_reg_flags & RF_warned))
4639                 {
4640                     PL_reg_flags |= RF_warned;
4641                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4642                         "%s limit (%d) exceeded",
4643                         "Complex regular subexpression recursion",
4644                         REG_INFTY - 1);
4645                 }
4646                 cur_curlyx->u.curlyx.count--;
4647                 CACHEsayNO;
4648             }
4649
4650             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4651                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4652             );
4653             /* Try grabbing another A and see if it helps. */
4654             PL_reginput = locinput;
4655             cur_curlyx->u.curlyx.lastloc = locinput;
4656             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4657             REGCP_SET(ST.lastcp);
4658             PUSH_STATE_GOTO(WHILEM_A_min,
4659                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4660             /* NOTREACHED */
4661
4662 #undef  ST
4663 #define ST st->u.branch
4664
4665         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4666             next = scan + ARG(scan);
4667             if (next == scan)
4668                 next = NULL;
4669             scan = NEXTOPER(scan);
4670             /* FALL THROUGH */
4671
4672         case BRANCH:        /*  /(...|A|...)/ */
4673             scan = NEXTOPER(scan); /* scan now points to inner node */
4674             ST.lastparen = *PL_reglastparen;
4675             ST.next_branch = next;
4676             REGCP_SET(ST.cp);
4677             PL_reginput = locinput;
4678
4679             /* Now go into the branch */
4680             if (has_cutgroup) {
4681                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4682             } else {
4683                 PUSH_STATE_GOTO(BRANCH_next, scan);
4684             }
4685             /* NOTREACHED */
4686         case CUTGROUP:
4687             PL_reginput = locinput;
4688             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4689                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4690             PUSH_STATE_GOTO(CUTGROUP_next,next);
4691             /* NOTREACHED */
4692         case CUTGROUP_next_fail:
4693             do_cutgroup = 1;
4694             no_final = 1;
4695             if (st->u.mark.mark_name)
4696                 sv_commit = st->u.mark.mark_name;
4697             sayNO;
4698             /* NOTREACHED */
4699         case BRANCH_next:
4700             sayYES;
4701             /* NOTREACHED */
4702         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4703             if (do_cutgroup) {
4704                 do_cutgroup = 0;
4705                 no_final = 0;
4706             }
4707             REGCP_UNWIND(ST.cp);
4708             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4709                 PL_regoffs[n].end = -1;
4710             *PL_reglastparen = n;
4711             /*dmq: *PL_reglastcloseparen = n; */
4712             scan = ST.next_branch;
4713             /* no more branches? */
4714             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4715                 DEBUG_EXECUTE_r({
4716                     PerlIO_printf( Perl_debug_log,
4717                         "%*s  %sBRANCH failed...%s\n",
4718                         REPORT_CODE_OFF+depth*2, "",
4719                         PL_colors[4],
4720                         PL_colors[5] );
4721                 });
4722                 sayNO_SILENT;
4723             }
4724             continue; /* execute next BRANCH[J] op */
4725             /* NOTREACHED */
4726
4727         case MINMOD:
4728             minmod = 1;
4729             break;
4730
4731 #undef  ST
4732 #define ST st->u.curlym
4733
4734         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4735
4736             /* This is an optimisation of CURLYX that enables us to push
4737              * only a single backtracking state, no matter how many matches
4738              * there are in {m,n}. It relies on the pattern being constant
4739              * length, with no parens to influence future backrefs
4740              */
4741
4742             ST.me = scan;
4743             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4744
4745             /* if paren positive, emulate an OPEN/CLOSE around A */
4746             if (ST.me->flags) {
4747                 U32 paren = ST.me->flags;
4748                 if (paren > PL_regsize)
4749                     PL_regsize = paren;
4750                 if (paren > *PL_reglastparen)
4751                     *PL_reglastparen = paren;
4752                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4753             }
4754             ST.A = scan;
4755             ST.B = next;
4756             ST.alen = 0;
4757             ST.count = 0;
4758             ST.minmod = minmod;
4759             minmod = 0;
4760             ST.c1 = CHRTEST_UNINIT;
4761             REGCP_SET(ST.cp);
4762
4763             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4764                 goto curlym_do_B;
4765
4766           curlym_do_A: /* execute the A in /A{m,n}B/  */
4767             PL_reginput = locinput;
4768             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4769             /* NOTREACHED */
4770
4771         case CURLYM_A: /* we've just matched an A */
4772             locinput = st->locinput;
4773             nextchr = UCHARAT(locinput);
4774
4775             ST.count++;
4776             /* after first match, determine A's length: u.curlym.alen */
4777             if (ST.count == 1) {
4778                 if (PL_reg_match_utf8) {
4779                     char *s = locinput;
4780                     while (s < PL_reginput) {
4781                         ST.alen++;
4782                         s += UTF8SKIP(s);
4783                     }
4784                 }
4785                 else {
4786                     ST.alen = PL_reginput - locinput;
4787                 }
4788                 if (ST.alen == 0)
4789                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4790             }
4791             DEBUG_EXECUTE_r(
4792                 PerlIO_printf(Perl_debug_log,
4793                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4794                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4795                           (IV) ST.count, (IV)ST.alen)
4796             );
4797
4798             locinput = PL_reginput;
4799
4800             if (cur_eval && cur_eval->u.eval.close_paren &&
4801                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4802                 goto fake_end;
4803
4804             {
4805                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4806                 if ( max == REG_INFTY || ST.count < max )
4807                     goto curlym_do_A; /* try to match another A */
4808             }
4809             goto curlym_do_B; /* try to match B */
4810
4811         case CURLYM_A_fail: /* just failed to match an A */
4812             REGCP_UNWIND(ST.cp);
4813
4814             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4815                 || (cur_eval && cur_eval->u.eval.close_paren &&
4816                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4817                 sayNO;
4818
4819           curlym_do_B: /* execute the B in /A{m,n}B/  */
4820             PL_reginput = locinput;
4821             if (ST.c1 == CHRTEST_UNINIT) {
4822                 /* calculate c1 and c2 for possible match of 1st char
4823                  * following curly */
4824                 ST.c1 = ST.c2 = CHRTEST_VOID;
4825                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4826                     regnode *text_node = ST.B;
4827                     if (! HAS_TEXT(text_node))
4828                         FIND_NEXT_IMPT(text_node);
4829                     /* this used to be
4830
4831                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4832
4833                         But the former is redundant in light of the latter.
4834
4835                         if this changes back then the macro for
4836                         IS_TEXT and friends need to change.
4837                      */
4838                     if (PL_regkind[OP(text_node)] == EXACT)
4839                     {
4840
4841                         ST.c1 = (U8)*STRING(text_node);
4842                         ST.c2 =
4843                             (IS_TEXTF(text_node))
4844                             ? PL_fold[ST.c1]
4845                             : (IS_TEXTFL(text_node))
4846                                 ? PL_fold_locale[ST.c1]
4847                                 : ST.c1;
4848                     }
4849                 }
4850             }
4851
4852             DEBUG_EXECUTE_r(
4853                 PerlIO_printf(Perl_debug_log,
4854                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4855                     (int)(REPORT_CODE_OFF+(depth*2)),
4856                     "", (IV)ST.count)
4857                 );
4858             if (ST.c1 != CHRTEST_VOID
4859                     && UCHARAT(PL_reginput) != ST.c1
4860                     && UCHARAT(PL_reginput) != ST.c2)
4861             {
4862                 /* simulate B failing */
4863                 DEBUG_OPTIMISE_r(
4864                     PerlIO_printf(Perl_debug_log,
4865                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4866                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4867                         (IV)ST.c1,(IV)ST.c2
4868                 ));
4869                 state_num = CURLYM_B_fail;
4870                 goto reenter_switch;
4871             }
4872
4873             if (ST.me->flags) {
4874                 /* mark current A as captured */
4875                 I32 paren = ST.me->flags;
4876                 if (ST.count) {
4877                     PL_regoffs[paren].start
4878                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4879                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4880                     /*dmq: *PL_reglastcloseparen = paren; */
4881                 }
4882                 else
4883                     PL_regoffs[paren].end = -1;
4884                 if (cur_eval && cur_eval->u.eval.close_paren &&
4885                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4886                 {
4887                     if (ST.count)
4888                         goto fake_end;
4889                     else
4890                         sayNO;
4891                 }
4892             }
4893
4894             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4895             /* NOTREACHED */
4896
4897         case CURLYM_B_fail: /* just failed to match a B */
4898             REGCP_UNWIND(ST.cp);
4899             if (ST.minmod) {
4900                 I32 max = ARG2(ST.me);
4901                 if (max != REG_INFTY && ST.count == max)
4902                     sayNO;
4903                 goto curlym_do_A; /* try to match a further A */
4904             }
4905             /* backtrack one A */
4906             if (ST.count == ARG1(ST.me) /* min */)
4907                 sayNO;
4908             ST.count--;
4909             locinput = HOPc(locinput, -ST.alen);
4910             goto curlym_do_B; /* try to match B */
4911
4912 #undef ST
4913 #define ST st->u.curly
4914
4915 #define CURLY_SETPAREN(paren, success) \
4916     if (paren) { \
4917         if (success) { \
4918             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4919             PL_regoffs[paren].end = locinput - PL_bostr; \
4920             *PL_reglastcloseparen = paren; \
4921         } \
4922         else \
4923             PL_regoffs[paren].end = -1; \
4924     }
4925
4926         case STAR:              /*  /A*B/ where A is width 1 */
4927             ST.paren = 0;
4928             ST.min = 0;
4929             ST.max = REG_INFTY;
4930             scan = NEXTOPER(scan);
4931             goto repeat;
4932         case PLUS:              /*  /A+B/ where A is width 1 */
4933             ST.paren = 0;
4934             ST.min = 1;
4935             ST.max = REG_INFTY;
4936             scan = NEXTOPER(scan);
4937             goto repeat;
4938         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
4939             ST.paren = scan->flags;     /* Which paren to set */
4940             if (ST.paren > PL_regsize)
4941                 PL_regsize = ST.paren;
4942             if (ST.paren > *PL_reglastparen)
4943                 *PL_reglastparen = ST.paren;
4944             ST.min = ARG1(scan);  /* min to match */
4945             ST.max = ARG2(scan);  /* max to match */
4946             if (cur_eval && cur_eval->u.eval.close_paren &&
4947                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4948                 ST.min=1;
4949                 ST.max=1;
4950             }
4951             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4952             goto repeat;
4953         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
4954             ST.paren = 0;
4955             ST.min = ARG1(scan);  /* min to match */
4956             ST.max = ARG2(scan);  /* max to match */
4957             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4958           repeat:
4959             /*
4960             * Lookahead to avoid useless match attempts
4961             * when we know what character comes next.
4962             *
4963             * Used to only do .*x and .*?x, but now it allows
4964             * for )'s, ('s and (?{ ... })'s to be in the way
4965             * of the quantifier and the EXACT-like node.  -- japhy
4966             */
4967
4968             if (ST.min > ST.max) /* XXX make this a compile-time check? */
4969                 sayNO;
4970             if (HAS_TEXT(next) || JUMPABLE(next)) {
4971                 U8 *s;
4972                 regnode *text_node = next;
4973
4974                 if (! HAS_TEXT(text_node))
4975                     FIND_NEXT_IMPT(text_node);
4976
4977                 if (! HAS_TEXT(text_node))
4978                     ST.c1 = ST.c2 = CHRTEST_VOID;
4979                 else {
4980                     if ( PL_regkind[OP(text_node)] != EXACT ) {
4981                         ST.c1 = ST.c2 = CHRTEST_VOID;
4982                         goto assume_ok_easy;
4983                     }
4984                     else
4985                         s = (U8*)STRING(text_node);
4986
4987                     /*  Currently we only get here when
4988
4989                         PL_rekind[OP(text_node)] == EXACT
4990
4991                         if this changes back then the macro for IS_TEXT and
4992                         friends need to change. */
4993                     if (!UTF_PATTERN) {
4994                         ST.c2 = ST.c1 = *s;
4995                         if (IS_TEXTF(text_node))
4996                             ST.c2 = PL_fold[ST.c1];
4997                         else if (IS_TEXTFL(text_node))
4998                             ST.c2 = PL_fold_locale[ST.c1];
4999                     }
5000                     else { /* UTF_PATTERN */
5001                         if (IS_TEXTF(text_node)) {
5002                              STRLEN ulen1, ulen2;
5003                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5004                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5005
5006                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5007                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5008 #ifdef EBCDIC
5009                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5010                                                     ckWARN(WARN_UTF8) ?
5011                                                     0 : UTF8_ALLOW_ANY);
5012                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5013                                                     ckWARN(WARN_UTF8) ?
5014                                                     0 : UTF8_ALLOW_ANY);
5015 #else
5016                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5017                                                     uniflags);
5018                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5019                                                     uniflags);
5020 #endif
5021                         }
5022                         else {
5023                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5024                                                      uniflags);
5025                         }
5026                     }
5027                 }
5028             }
5029             else
5030                 ST.c1 = ST.c2 = CHRTEST_VOID;
5031         assume_ok_easy:
5032
5033             ST.A = scan;
5034             ST.B = next;
5035             PL_reginput = locinput;
5036             if (minmod) {
5037                 minmod = 0;
5038                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5039                     sayNO;
5040                 ST.count = ST.min;
5041                 locinput = PL_reginput;
5042                 REGCP_SET(ST.cp);
5043                 if (ST.c1 == CHRTEST_VOID)
5044                     goto curly_try_B_min;
5045
5046                 ST.oldloc = locinput;
5047
5048                 /* set ST.maxpos to the furthest point along the
5049                  * string that could possibly match */
5050                 if  (ST.max == REG_INFTY) {
5051                     ST.maxpos = PL_regeol - 1;
5052                     if (utf8_target)
5053                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5054                             ST.maxpos--;
5055                 }
5056                 else if (utf8_target) {
5057                     int m = ST.max - ST.min;
5058                     for (ST.maxpos = locinput;
5059                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5060                         ST.maxpos += UTF8SKIP(ST.maxpos);
5061                 }
5062                 else {
5063                     ST.maxpos = locinput + ST.max - ST.min;
5064                     if (ST.maxpos >= PL_regeol)
5065                         ST.maxpos = PL_regeol - 1;
5066                 }
5067                 goto curly_try_B_min_known;
5068
5069             }
5070             else {
5071                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5072                 locinput = PL_reginput;
5073                 if (ST.count < ST.min)
5074                     sayNO;
5075                 if ((ST.count > ST.min)
5076                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5077                 {
5078                     /* A{m,n} must come at the end of the string, there's
5079                      * no point in backing off ... */
5080                     ST.min = ST.count;
5081                     /* ...except that $ and \Z can match before *and* after
5082                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5083                        We may back off by one in this case. */
5084                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5085                         ST.min--;
5086                 }
5087                 REGCP_SET(ST.cp);
5088                 goto curly_try_B_max;
5089             }
5090             /* NOTREACHED */
5091
5092
5093         case CURLY_B_min_known_fail:
5094             /* failed to find B in a non-greedy match where c1,c2 valid */
5095             if (ST.paren && ST.count)
5096                 PL_regoffs[ST.paren].end = -1;
5097
5098             PL_reginput = locinput;     /* Could be reset... */
5099             REGCP_UNWIND(ST.cp);
5100             /* Couldn't or didn't -- move forward. */
5101             ST.oldloc = locinput;
5102             if (utf8_target)
5103                 locinput += UTF8SKIP(locinput);
5104             else
5105                 locinput++;
5106             ST.count++;
5107           curly_try_B_min_known:
5108              /* find the next place where 'B' could work, then call B */
5109             {
5110                 int n;
5111                 if (utf8_target) {
5112                     n = (ST.oldloc == locinput) ? 0 : 1;
5113                     if (ST.c1 == ST.c2) {
5114                         STRLEN len;
5115                         /* set n to utf8_distance(oldloc, locinput) */
5116                         while (locinput <= ST.maxpos &&
5117                                utf8n_to_uvchr((U8*)locinput,
5118                                               UTF8_MAXBYTES, &len,
5119                                               uniflags) != (UV)ST.c1) {
5120                             locinput += len;
5121                             n++;
5122                         }
5123                     }
5124                     else {
5125                         /* set n to utf8_distance(oldloc, locinput) */
5126                         while (locinput <= ST.maxpos) {
5127                             STRLEN len;
5128                             const UV c = utf8n_to_uvchr((U8*)locinput,
5129                                                   UTF8_MAXBYTES, &len,
5130                                                   uniflags);
5131                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5132                                 break;
5133                             locinput += len;
5134                             n++;
5135                         }
5136                     }
5137                 }
5138                 else {
5139                     if (ST.c1 == ST.c2) {
5140                         while (locinput <= ST.maxpos &&
5141                                UCHARAT(locinput) != ST.c1)
5142                             locinput++;
5143                     }
5144                     else {
5145                         while (locinput <= ST.maxpos
5146                                && UCHARAT(locinput) != ST.c1
5147                                && UCHARAT(locinput) != ST.c2)
5148                             locinput++;
5149                     }
5150                     n = locinput - ST.oldloc;
5151                 }
5152                 if (locinput > ST.maxpos)
5153                     sayNO;
5154                 /* PL_reginput == oldloc now */
5155                 if (n) {
5156                     ST.count += n;
5157                     if (regrepeat(rex, ST.A, n, depth) < n)
5158                         sayNO;
5159                 }
5160                 PL_reginput = locinput;
5161                 CURLY_SETPAREN(ST.paren, ST.count);
5162                 if (cur_eval && cur_eval->u.eval.close_paren &&
5163                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5164                     goto fake_end;
5165                 }
5166                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5167             }
5168             /* NOTREACHED */
5169
5170
5171         case CURLY_B_min_fail:
5172             /* failed to find B in a non-greedy match where c1,c2 invalid */
5173             if (ST.paren && ST.count)
5174                 PL_regoffs[ST.paren].end = -1;
5175
5176             REGCP_UNWIND(ST.cp);
5177             /* failed -- move forward one */
5178             PL_reginput = locinput;
5179             if (regrepeat(rex, ST.A, 1, depth)) {
5180                 ST.count++;
5181                 locinput = PL_reginput;
5182                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5183                         ST.count > 0)) /* count overflow ? */
5184                 {
5185                   curly_try_B_min:
5186                     CURLY_SETPAREN(ST.paren, ST.count);
5187                     if (cur_eval && cur_eval->u.eval.close_paren &&
5188                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5189                         goto fake_end;
5190                     }
5191                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5192                 }
5193             }
5194             sayNO;
5195             /* NOTREACHED */
5196
5197
5198         curly_try_B_max:
5199             /* a successful greedy match: now try to match B */
5200             if (cur_eval && cur_eval->u.eval.close_paren &&
5201                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5202                 goto fake_end;
5203             }
5204             {
5205                 UV c = 0;
5206                 if (ST.c1 != CHRTEST_VOID)
5207                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5208                                            UTF8_MAXBYTES, 0, uniflags)
5209                                 : (UV) UCHARAT(PL_reginput);
5210                 /* If it could work, try it. */
5211                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5212                     CURLY_SETPAREN(ST.paren, ST.count);
5213                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5214                     /* NOTREACHED */
5215                 }
5216             }
5217             /* FALL THROUGH */
5218         case CURLY_B_max_fail:
5219             /* failed to find B in a greedy match */
5220             if (ST.paren && ST.count)
5221                 PL_regoffs[ST.paren].end = -1;
5222
5223             REGCP_UNWIND(ST.cp);
5224             /*  back up. */
5225             if (--ST.count < ST.min)
5226                 sayNO;
5227             PL_reginput = locinput = HOPc(locinput, -1);
5228             goto curly_try_B_max;
5229
5230 #undef ST
5231
5232         case END:
5233             fake_end:
5234             if (cur_eval) {
5235                 /* we've just finished A in /(??{A})B/; now continue with B */
5236                 I32 tmpix;
5237                 st->u.eval.toggle_reg_flags
5238                             = cur_eval->u.eval.toggle_reg_flags;
5239                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5240
5241                 st->u.eval.prev_rex = rex_sv;           /* inner */
5242                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5243                 rex = (struct regexp *)SvANY(rex_sv);
5244                 rexi = RXi_GET(rex);
5245                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5246                 ReREFCNT_inc(rex_sv);
5247                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5248
5249                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5250                 PL_reglastparen = &rex->lastparen;
5251                 PL_reglastcloseparen = &rex->lastcloseparen;
5252
5253                 REGCP_SET(st->u.eval.lastcp);
5254                 PL_reginput = locinput;
5255
5256                 /* Restore parens of the outer rex without popping the
5257                  * savestack */
5258                 tmpix = PL_savestack_ix;
5259                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5260                 regcppop(rex);
5261                 PL_savestack_ix = tmpix;
5262
5263                 st->u.eval.prev_eval = cur_eval;
5264                 cur_eval = cur_eval->u.eval.prev_eval;
5265                 DEBUG_EXECUTE_r(
5266                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5267                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5268                 if ( nochange_depth )
5269                     nochange_depth--;
5270
5271                 PUSH_YES_STATE_GOTO(EVAL_AB,
5272                         st->u.eval.prev_eval->u.eval.B); /* match B */
5273             }
5274
5275             if (locinput < reginfo->till) {
5276                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5277                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5278                                       PL_colors[4],
5279                                       (long)(locinput - PL_reg_starttry),
5280                                       (long)(reginfo->till - PL_reg_starttry),
5281                                       PL_colors[5]));
5282
5283                 sayNO_SILENT;           /* Cannot match: too short. */
5284             }
5285             PL_reginput = locinput;     /* put where regtry can find it */
5286             sayYES;                     /* Success! */
5287
5288         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5289             DEBUG_EXECUTE_r(
5290             PerlIO_printf(Perl_debug_log,
5291                 "%*s  %ssubpattern success...%s\n",
5292                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5293             PL_reginput = locinput;     /* put where regtry can find it */
5294             sayYES;                     /* Success! */
5295
5296 #undef  ST
5297 #define ST st->u.ifmatch
5298
5299         case SUSPEND:   /* (?>A) */
5300             ST.wanted = 1;
5301             PL_reginput = locinput;
5302             goto do_ifmatch;
5303
5304         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5305             ST.wanted = 0;
5306             goto ifmatch_trivial_fail_test;
5307
5308         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5309             ST.wanted = 1;
5310           ifmatch_trivial_fail_test:
5311             if (scan->flags) {
5312                 char * const s = HOPBACKc(locinput, scan->flags);
5313                 if (!s) {
5314                     /* trivial fail */
5315                     if (logical) {
5316                         logical = 0;
5317                         sw = 1 - cBOOL(ST.wanted);
5318                     }
5319                     else if (ST.wanted)
5320                         sayNO;
5321                     next = scan + ARG(scan);
5322                     if (next == scan)
5323                         next = NULL;
5324                     break;
5325                 }
5326                 PL_reginput = s;
5327             }
5328             else
5329                 PL_reginput = locinput;
5330
5331           do_ifmatch:
5332             ST.me = scan;
5333             ST.logical = logical;
5334             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5335
5336             /* execute body of (?...A) */
5337             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5338             /* NOTREACHED */
5339
5340         case IFMATCH_A_fail: /* body of (?...A) failed */
5341             ST.wanted = !ST.wanted;
5342             /* FALL THROUGH */
5343
5344         case IFMATCH_A: /* body of (?...A) succeeded */
5345             if (ST.logical) {
5346                 sw = cBOOL(ST.wanted);
5347             }
5348             else if (!ST.wanted)
5349                 sayNO;
5350
5351             if (OP(ST.me) == SUSPEND)
5352                 locinput = PL_reginput;
5353             else {
5354                 locinput = PL_reginput = st->locinput;
5355                 nextchr = UCHARAT(locinput);
5356             }
5357             scan = ST.me + ARG(ST.me);
5358             if (scan == ST.me)
5359                 scan = NULL;
5360             continue; /* execute B */
5361
5362 #undef ST
5363
5364         case LONGJMP:
5365             next = scan + ARG(scan);
5366             if (next == scan)
5367                 next = NULL;
5368             break;
5369         case COMMIT:
5370             reginfo->cutpoint = PL_regeol;
5371             /* FALLTHROUGH */
5372         case PRUNE:
5373             PL_reginput = locinput;
5374             if (!scan->flags)
5375                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5376             PUSH_STATE_GOTO(COMMIT_next,next);
5377             /* NOTREACHED */
5378         case COMMIT_next_fail:
5379             no_final = 1;
5380             /* FALLTHROUGH */
5381         case OPFAIL:
5382             sayNO;
5383             /* NOTREACHED */
5384
5385 #define ST st->u.mark
5386         case MARKPOINT:
5387             ST.prev_mark = mark_state;
5388             ST.mark_name = sv_commit = sv_yes_mark
5389                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5390             mark_state = st;
5391             ST.mark_loc = PL_reginput = locinput;
5392             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5393             /* NOTREACHED */
5394         case MARKPOINT_next:
5395             mark_state = ST.prev_mark;
5396             sayYES;
5397             /* NOTREACHED */
5398         case MARKPOINT_next_fail:
5399             if (popmark && sv_eq(ST.mark_name,popmark))
5400             {
5401                 if (ST.mark_loc > startpoint)
5402                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5403                 popmark = NULL; /* we found our mark */
5404                 sv_commit = ST.mark_name;
5405
5406                 DEBUG_EXECUTE_r({
5407                         PerlIO_printf(Perl_debug_log,
5408                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5409                             REPORT_CODE_OFF+depth*2, "",
5410                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5411                 });
5412             }
5413             mark_state = ST.prev_mark;
5414             sv_yes_mark = mark_state ?
5415                 mark_state->u.mark.mark_name : NULL;
5416             sayNO;
5417             /* NOTREACHED */
5418         case SKIP:
5419             PL_reginput = locinput;
5420             if (scan->flags) {
5421                 /* (*SKIP) : if we fail we cut here*/
5422                 ST.mark_name = NULL;
5423                 ST.mark_loc = locinput;
5424                 PUSH_STATE_GOTO(SKIP_next,next);
5425             } else {
5426                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5427                    otherwise do nothing.  Meaning we need to scan
5428                  */
5429                 regmatch_state *cur = mark_state;
5430                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5431
5432                 while (cur) {
5433                     if ( sv_eq( cur->u.mark.mark_name,
5434                                 find ) )
5435                     {
5436                         ST.mark_name = find;
5437                         PUSH_STATE_GOTO( SKIP_next, next );
5438                     }
5439                     cur = cur->u.mark.prev_mark;
5440                 }
5441             }
5442             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5443             break;
5444         case SKIP_next_fail:
5445             if (ST.mark_name) {
5446                 /* (*CUT:NAME) - Set up to search for the name as we
5447                    collapse the stack*/
5448                 popmark = ST.mark_name;
5449             } else {
5450                 /* (*CUT) - No name, we cut here.*/
5451                 if (ST.mark_loc > startpoint)
5452                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5453                 /* but we set sv_commit to latest mark_name if there
5454                    is one so they can test to see how things lead to this
5455                    cut */
5456                 if (mark_state)
5457                     sv_commit=mark_state->u.mark.mark_name;
5458             }
5459             no_final = 1;
5460             sayNO;
5461             /* NOTREACHED */
5462 #undef ST
5463         case FOLDCHAR:
5464             n = ARG(scan);
5465             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5466                 locinput += ln;
5467             } else if ( 0xDF == n && !utf8_target && !UTF_PATTERN ) {
5468                 sayNO;
5469             } else  {
5470                 U8 folded[UTF8_MAXBYTES_CASE+1];
5471                 STRLEN foldlen;
5472                 const char * const l = locinput;
5473                 char *e = PL_regeol;
5474                 to_uni_fold(n, folded, &foldlen);
5475
5476                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5477                                l, &e, 0,  utf8_target)) {
5478                         sayNO;
5479                 }
5480                 locinput = e;
5481             }
5482             nextchr = UCHARAT(locinput);
5483             break;
5484         case LNBREAK:
5485             if ((n=is_LNBREAK(locinput,utf8_target))) {
5486                 locinput += n;
5487                 nextchr = UCHARAT(locinput);
5488             } else
5489                 sayNO;
5490             break;
5491
5492 #define CASE_CLASS(nAmE)                              \
5493         case nAmE:                                    \
5494             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5495                 locinput += n;                        \
5496                 nextchr = UCHARAT(locinput);          \
5497             } else                                    \
5498                 sayNO;                                \
5499             break;                                    \
5500         case N##nAmE:                                 \
5501             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5502                 sayNO;                                \
5503             } else {                                  \
5504                 locinput += UTF8SKIP(locinput);       \
5505                 nextchr = UCHARAT(locinput);          \
5506             }                                         \
5507             break
5508
5509         CASE_CLASS(VERTWS);
5510         CASE_CLASS(HORIZWS);
5511 #undef CASE_CLASS
5512
5513         default:
5514             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5515                           PTR2UV(scan), OP(scan));
5516             Perl_croak(aTHX_ "regexp memory corruption");
5517
5518         } /* end switch */
5519
5520         /* switch break jumps here */
5521         scan = next; /* prepare to execute the next op and ... */
5522         continue;    /* ... jump back to the top, reusing st */
5523         /* NOTREACHED */
5524
5525       push_yes_state:
5526         /* push a state that backtracks on success */
5527         st->u.yes.prev_yes_state = yes_state;
5528         yes_state = st;
5529         /* FALL THROUGH */
5530       push_state:
5531         /* push a new regex state, then continue at scan  */
5532         {
5533             regmatch_state *newst;
5534
5535             DEBUG_STACK_r({
5536                 regmatch_state *cur = st;
5537                 regmatch_state *curyes = yes_state;
5538                 int curd = depth;
5539                 regmatch_slab *slab = PL_regmatch_slab;
5540                 for (;curd > -1;cur--,curd--) {
5541                     if (cur < SLAB_FIRST(slab)) {
5542                         slab = slab->prev;
5543                         cur = SLAB_LAST(slab);
5544                     }
5545                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5546                         REPORT_CODE_OFF + 2 + depth * 2,"",
5547                         curd, PL_reg_name[cur->resume_state],
5548                         (curyes == cur) ? "yes" : ""
5549                     );
5550                     if (curyes == cur)
5551                         curyes = cur->u.yes.prev_yes_state;
5552                 }
5553             } else
5554                 DEBUG_STATE_pp("push")
5555             );
5556             depth++;
5557             st->locinput = locinput;
5558             newst = st+1;
5559             if (newst >  SLAB_LAST(PL_regmatch_slab))
5560                 newst = S_push_slab(aTHX);
5561             PL_regmatch_state = newst;
5562
5563             locinput = PL_reginput;
5564             nextchr = UCHARAT(locinput);
5565             st = newst;
5566             continue;
5567             /* NOTREACHED */
5568         }
5569     }
5570
5571     /*
5572     * We get here only if there's trouble -- normally "case END" is
5573     * the terminating point.
5574     */
5575     Perl_croak(aTHX_ "corrupted regexp pointers");
5576     /*NOTREACHED*/
5577     sayNO;
5578
5579 yes:
5580     if (yes_state) {
5581         /* we have successfully completed a subexpression, but we must now
5582          * pop to the state marked by yes_state and continue from there */
5583         assert(st != yes_state);
5584 #ifdef DEBUGGING
5585         while (st != yes_state) {
5586             st--;
5587             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5588                 PL_regmatch_slab = PL_regmatch_slab->prev;
5589                 st = SLAB_LAST(PL_regmatch_slab);
5590             }
5591             DEBUG_STATE_r({
5592                 if (no_final) {
5593                     DEBUG_STATE_pp("pop (no final)");
5594                 } else {
5595                     DEBUG_STATE_pp("pop (yes)");
5596                 }
5597             });
5598             depth--;
5599         }
5600 #else
5601         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5602             || yes_state > SLAB_LAST(PL_regmatch_slab))
5603         {
5604             /* not in this slab, pop slab */
5605             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5606             PL_regmatch_slab = PL_regmatch_slab->prev;
5607             st = SLAB_LAST(PL_regmatch_slab);
5608         }
5609         depth -= (st - yes_state);
5610 #endif
5611         st = yes_state;
5612         yes_state = st->u.yes.prev_yes_state;
5613         PL_regmatch_state = st;
5614
5615         if (no_final) {
5616             locinput= st->locinput;
5617             nextchr = UCHARAT(locinput);
5618         }
5619         state_num = st->resume_state + no_final;
5620         goto reenter_switch;
5621     }
5622
5623     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5624                           PL_colors[4], PL_colors[5]));
5625
5626     if (PL_reg_eval_set) {
5627         /* each successfully executed (?{...}) block does the equivalent of
5628          *   local $^R = do {...}
5629          * When popping the save stack, all these locals would be undone;
5630          * bypass this by setting the outermost saved $^R to the latest
5631          * value */
5632         if (oreplsv != GvSV(PL_replgv))
5633             sv_setsv(oreplsv, GvSV(PL_replgv));
5634     }
5635     result = 1;
5636     goto final_exit;
5637
5638 no:
5639     DEBUG_EXECUTE_r(
5640         PerlIO_printf(Perl_debug_log,
5641             "%*s  %sfailed...%s\n",
5642             REPORT_CODE_OFF+depth*2, "",
5643             PL_colors[4], PL_colors[5])
5644         );
5645
5646 no_silent:
5647     if (no_final) {
5648         if (yes_state) {
5649             goto yes;
5650         } else {
5651             goto final_exit;
5652         }
5653     }
5654     if (depth) {
5655         /* there's a previous state to backtrack to */
5656         st--;
5657         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5658             PL_regmatch_slab = PL_regmatch_slab->prev;
5659             st = SLAB_LAST(PL_regmatch_slab);
5660         }
5661         PL_regmatch_state = st;
5662         locinput= st->locinput;
5663         nextchr = UCHARAT(locinput);
5664
5665         DEBUG_STATE_pp("pop");
5666         depth--;
5667         if (yes_state == st)
5668             yes_state = st->u.yes.prev_yes_state;
5669
5670         state_num = st->resume_state + 1; /* failure = success + 1 */
5671         goto reenter_switch;
5672     }
5673     result = 0;
5674
5675   final_exit:
5676     if (rex->intflags & PREGf_VERBARG_SEEN) {
5677         SV *sv_err = get_sv("REGERROR", 1);
5678         SV *sv_mrk = get_sv("REGMARK", 1);
5679         if (result) {
5680             sv_commit = &PL_sv_no;
5681             if (!sv_yes_mark)
5682                 sv_yes_mark = &PL_sv_yes;
5683         } else {
5684             if (!sv_commit)
5685                 sv_commit = &PL_sv_yes;
5686             sv_yes_mark = &PL_sv_no;
5687         }
5688         sv_setsv(sv_err, sv_commit);
5689         sv_setsv(sv_mrk, sv_yes_mark);
5690     }
5691
5692     /* clean up; in particular, free all slabs above current one */
5693     LEAVE_SCOPE(oldsave);
5694
5695     return result;
5696 }
5697
5698 /*
5699  - regrepeat - repeatedly match something simple, report how many
5700  */
5701 /*
5702  * [This routine now assumes that it will only match on things of length 1.
5703  * That was true before, but now we assume scan - reginput is the count,
5704  * rather than incrementing count on every character.  [Er, except utf8.]]
5705  */
5706 STATIC I32
5707 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5708 {
5709     dVAR;
5710     register char *scan;
5711     register I32 c;
5712     register char *loceol = PL_regeol;
5713     register I32 hardcount = 0;
5714     register bool utf8_target = PL_reg_match_utf8;
5715 #ifndef DEBUGGING
5716     PERL_UNUSED_ARG(depth);
5717 #endif
5718
5719     PERL_ARGS_ASSERT_REGREPEAT;
5720
5721     scan = PL_reginput;
5722     if (max == REG_INFTY)
5723         max = I32_MAX;
5724     else if (max < loceol - scan)
5725         loceol = scan + max;
5726     switch (OP(p)) {
5727     case REG_ANY:
5728         if (utf8_target) {
5729             loceol = PL_regeol;
5730             while (scan < loceol && hardcount < max && *scan != '\n') {
5731                 scan += UTF8SKIP(scan);
5732                 hardcount++;
5733             }
5734         } else {
5735             while (scan < loceol && *scan != '\n')
5736                 scan++;
5737         }
5738         break;
5739     case SANY:
5740         if (utf8_target) {
5741             loceol = PL_regeol;
5742             while (scan < loceol && hardcount < max) {
5743                 scan += UTF8SKIP(scan);
5744                 hardcount++;
5745             }
5746         }
5747         else
5748             scan = loceol;
5749         break;
5750     case CANY:
5751         scan = loceol;
5752         break;
5753     case EXACT:         /* length of string is 1 */
5754         c = (U8)*STRING(p);
5755         while (scan < loceol && UCHARAT(scan) == c)
5756             scan++;
5757         break;
5758     case EXACTF:        /* length of string is 1 */
5759         c = (U8)*STRING(p);
5760         while (scan < loceol &&
5761                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5762             scan++;
5763         break;
5764     case EXACTFL:       /* length of string is 1 */
5765         PL_reg_flags |= RF_tainted;
5766         c = (U8)*STRING(p);
5767         while (scan < loceol &&
5768                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5769             scan++;
5770         break;
5771     case ANYOF:
5772         if (utf8_target) {
5773             loceol = PL_regeol;
5774             while (hardcount < max && scan < loceol &&
5775                    reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
5776                 scan += UTF8SKIP(scan);
5777                 hardcount++;
5778             }
5779         } else {
5780             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5781                 scan++;
5782         }
5783         break;
5784     case ALNUM:
5785         if (utf8_target) {
5786             loceol = PL_regeol;
5787             LOAD_UTF8_CHARCLASS_ALNUM();
5788             while (hardcount < max && scan < loceol &&
5789                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5790             {
5791                 scan += UTF8SKIP(scan);
5792                 hardcount++;
5793             }
5794         } else if (FLAGS(p) & USE_UNI) {
5795             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
5796                 scan++;
5797             }
5798         } else {
5799             while (scan < loceol && isALNUM((U8) *scan)) {
5800                 scan++;
5801             }
5802         }
5803         break;
5804     case ALNUML:
5805         PL_reg_flags |= RF_tainted;
5806         if (utf8_target) {
5807             loceol = PL_regeol;
5808             while (hardcount < max && scan < loceol &&
5809                    isALNUM_LC_utf8((U8*)scan)) {
5810                 scan += UTF8SKIP(scan);
5811                 hardcount++;
5812             }
5813         } else {
5814             while (scan < loceol && isALNUM_LC(*scan))
5815                 scan++;
5816         }
5817         break;
5818     case NALNUM:
5819         if (utf8_target) {
5820             loceol = PL_regeol;
5821             LOAD_UTF8_CHARCLASS_ALNUM();
5822             while (hardcount < max && scan < loceol &&
5823                    !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5824             {
5825                 scan += UTF8SKIP(scan);
5826                 hardcount++;
5827             }
5828         } else if (FLAGS(p) & USE_UNI) {
5829             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
5830                 scan++;
5831             }
5832         } else {
5833             while (scan < loceol && ! isALNUM((U8) *scan)) {
5834                 scan++;
5835             }
5836         }
5837         break;
5838     case NALNUML:
5839         PL_reg_flags |= RF_tainted;
5840         if (utf8_target) {
5841             loceol = PL_regeol;
5842             while (hardcount < max && scan < loceol &&
5843                    !isALNUM_LC_utf8((U8*)scan)) {
5844                 scan += UTF8SKIP(scan);
5845                 hardcount++;
5846             }
5847         } else {
5848             while (scan < loceol && !isALNUM_LC(*scan))
5849                 scan++;
5850         }
5851         break;
5852     case SPACE:
5853         if (utf8_target) {
5854             loceol = PL_regeol;
5855             LOAD_UTF8_CHARCLASS_SPACE();
5856             while (hardcount < max && scan < loceol &&
5857                    (*scan == ' ' ||
5858                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5859             {
5860                 scan += UTF8SKIP(scan);
5861                 hardcount++;
5862             }
5863         } else if (FLAGS(p) & USE_UNI) {
5864             while (scan < loceol && isSPACE_L1((U8) *scan)) {
5865                 scan++;
5866             }
5867         } else {
5868             while (scan < loceol && isSPACE((U8) *scan))
5869                 scan++;
5870         }
5871         break;
5872     case SPACEL:
5873         PL_reg_flags |= RF_tainted;
5874         if (utf8_target) {
5875             loceol = PL_regeol;
5876             while (hardcount < max && scan < loceol &&
5877                    (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5878                 scan += UTF8SKIP(scan);
5879                 hardcount++;
5880             }
5881         } else {
5882             while (scan < loceol && isSPACE_LC(*scan))
5883                 scan++;
5884         }
5885         break;
5886     case NSPACE:
5887         if (utf8_target) {
5888             loceol = PL_regeol;
5889             LOAD_UTF8_CHARCLASS_SPACE();
5890             while (hardcount < max && scan < loceol &&
5891                    !(*scan == ' ' ||
5892                      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5893             {
5894                 scan += UTF8SKIP(scan);
5895                 hardcount++;
5896             }
5897         } else if (FLAGS(p) & USE_UNI) {
5898             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
5899                 scan++;
5900             }
5901         } else {
5902             while (scan < loceol && ! isSPACE((U8) *scan)) {
5903                 scan++;
5904             }
5905         }
5906         break;
5907     case NSPACEL:
5908         PL_reg_flags |= RF_tainted;
5909         if (utf8_target) {
5910             loceol = PL_regeol;
5911             while (hardcount < max && scan < loceol &&
5912                    !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5913                 scan += UTF8SKIP(scan);
5914                 hardcount++;
5915             }
5916         } else {
5917             while (scan < loceol && !isSPACE_LC(*scan))
5918                 scan++;
5919         }
5920         break;
5921     case DIGIT:
5922         if (utf8_target) {
5923             loceol = PL_regeol;
5924             LOAD_UTF8_CHARCLASS_DIGIT();
5925             while (hardcount < max && scan < loceol &&
5926                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
5927                 scan += UTF8SKIP(scan);
5928                 hardcount++;
5929             }
5930         } else {
5931             while (scan < loceol && isDIGIT(*scan))
5932                 scan++;
5933         }
5934         break;
5935     case NDIGIT:
5936         if (utf8_target) {
5937             loceol = PL_regeol;
5938             LOAD_UTF8_CHARCLASS_DIGIT();
5939             while (hardcount < max && scan < loceol &&
5940                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
5941                 scan += UTF8SKIP(scan);
5942                 hardcount++;
5943             }
5944         } else {
5945             while (scan < loceol && !isDIGIT(*scan))
5946                 scan++;
5947         }
5948     case LNBREAK:
5949         if (utf8_target) {
5950             loceol = PL_regeol;
5951             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5952                 scan += c;
5953                 hardcount++;
5954             }
5955         } else {
5956             /*
5957               LNBREAK can match two latin chars, which is ok,
5958               because we have a null terminated string, but we
5959               have to use hardcount in this situation
5960             */
5961             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
5962                 scan+=c;
5963                 hardcount++;
5964             }
5965         }
5966         break;
5967     case HORIZWS:
5968         if (utf8_target) {
5969             loceol = PL_regeol;
5970             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5971                 scan += c;
5972                 hardcount++;
5973             }
5974         } else {
5975             while (scan < loceol && is_HORIZWS_latin1(scan))
5976                 scan++;
5977         }
5978         break;
5979     case NHORIZWS:
5980         if (utf8_target) {
5981             loceol = PL_regeol;
5982             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5983                 scan += UTF8SKIP(scan);
5984                 hardcount++;
5985             }
5986         } else {
5987             while (scan < loceol && !is_HORIZWS_latin1(scan))
5988                 scan++;
5989
5990         }
5991         break;
5992     case VERTWS:
5993         if (utf8_target) {
5994             loceol = PL_regeol;
5995             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5996                 scan += c;
5997                 hardcount++;
5998             }
5999         } else {
6000             while (scan < loceol && is_VERTWS_latin1(scan))
6001                 scan++;
6002
6003         }
6004         break;
6005     case NVERTWS:
6006         if (utf8_target) {
6007             loceol = PL_regeol;
6008             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6009                 scan += UTF8SKIP(scan);
6010                 hardcount++;
6011             }
6012         } else {
6013             while (scan < loceol && !is_VERTWS_latin1(scan))
6014                 scan++;
6015
6016         }
6017         break;
6018
6019     default:            /* Called on something of 0 width. */
6020         break;          /* So match right here or not at all. */
6021     }
6022
6023     if (hardcount)
6024         c = hardcount;
6025     else
6026         c = scan - PL_reginput;
6027     PL_reginput = scan;
6028
6029     DEBUG_r({
6030         GET_RE_DEBUG_FLAGS_DECL;
6031         DEBUG_EXECUTE_r({
6032             SV * const prop = sv_newmortal();
6033             regprop(prog, prop, p);
6034             PerlIO_printf(Perl_debug_log,
6035                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6036                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6037         });
6038     });
6039
6040     return(c);
6041 }
6042
6043
6044 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6045 /*
6046 - regclass_swash - prepare the utf8 swash
6047 */
6048
6049 SV *
6050 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6051 {
6052     dVAR;
6053     SV *sw  = NULL;
6054     SV *si  = NULL;
6055     SV *alt = NULL;
6056     RXi_GET_DECL(prog,progi);
6057     const struct reg_data * const data = prog ? progi->data : NULL;
6058
6059     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6060
6061     if (data && data->count) {
6062         const U32 n = ARG(node);
6063
6064         if (data->what[n] == 's') {
6065             SV * const rv = MUTABLE_SV(data->data[n]);
6066             AV * const av = MUTABLE_AV(SvRV(rv));
6067             SV **const ary = AvARRAY(av);
6068             SV **a, **b;
6069
6070             /* See the end of regcomp.c:S_regclass() for
6071              * documentation of these array elements. */
6072
6073             si = *ary;
6074             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6075             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6076
6077             if (a)
6078                 sw = *a;
6079             else if (si && doinit) {
6080                 sw = swash_init("utf8", "", si, 1, 0);
6081                 (void)av_store(av, 1, sw);
6082             }
6083             if (b)
6084                 alt = *b;
6085         }
6086     }
6087
6088     if (listsvp)
6089         *listsvp = si;
6090     if (altsvp)
6091         *altsvp  = alt;
6092
6093     return sw;
6094 }
6095 #endif
6096
6097 /*
6098  - reginclass - determine if a character falls into a character class
6099
6100   The n is the ANYOF regnode, the p is the target string, lenp
6101   is pointer to the maximum length of how far to go in the p
6102   (if the lenp is zero, UTF8SKIP(p) is used),
6103   utf8_target tells whether the target string is in UTF-8.
6104
6105  */
6106
6107 STATIC bool
6108 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool utf8_target)
6109 {
6110     dVAR;
6111     const char flags = ANYOF_FLAGS(n);
6112     bool match = FALSE;
6113     UV c = *p;
6114     STRLEN len = 0;
6115     STRLEN plen;
6116
6117     PERL_ARGS_ASSERT_REGINCLASS;
6118
6119     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6120         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
6121                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6122                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6123                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6124                  * UTF8_ALLOW_FFFF */
6125         if (len == (STRLEN)-1)
6126             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6127     }
6128
6129     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
6130     if (utf8_target || (flags & ANYOF_UNICODE)) {
6131         if (lenp)
6132             *lenp = 0;
6133         if (utf8_target && !ANYOF_RUNTIME(n)) {
6134             if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
6135                 match = TRUE;
6136         }
6137         if (!match && utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256)
6138             match = TRUE;
6139         if (!match) {
6140             AV *av;
6141             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6142
6143             if (sw) {
6144                 U8 * utf8_p;
6145                 if (utf8_target) {
6146                     utf8_p = (U8 *) p;
6147                 } else {
6148                     STRLEN len = 1;
6149                     utf8_p = bytes_to_utf8(p, &len);
6150                 }
6151                 if (swash_fetch(sw, utf8_p, 1))
6152                     match = TRUE;
6153                 else if (flags & ANYOF_FOLD) {
6154                     if (!match && lenp && av) {
6155                         I32 i;
6156                         for (i = 0; i <= av_len(av); i++) {
6157                             SV* const sv = *av_fetch(av, i, FALSE);
6158                             STRLEN len;
6159                             const char * const s = SvPV_const(sv, len);
6160                             if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
6161                                 *lenp = len;
6162                                 match = TRUE;
6163                                 break;
6164                             }
6165                         }
6166                     }
6167                     if (!match) {
6168                         U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
6169
6170                         STRLEN tmplen;
6171                         to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6172                         if (swash_fetch(sw, tmpbuf, 1))
6173                             match = TRUE;
6174                     }
6175                 }
6176
6177                 /* If we allocated a string above, free it */
6178                 if (! utf8_target) Safefree(utf8_p);
6179             }
6180         }
6181         if (match && lenp && *lenp == 0)
6182             *lenp = UNISKIP(NATIVE_TO_UNI(c));
6183     }
6184     if (!match && c < 256) {
6185         if (ANYOF_BITMAP_TEST(n, c))
6186             match = TRUE;
6187         else if (flags & ANYOF_FOLD) {
6188             U8 f;
6189
6190             if (flags & ANYOF_LOCALE) {
6191                 PL_reg_flags |= RF_tainted;
6192                 f = PL_fold_locale[c];
6193             }
6194             else
6195                 f = PL_fold[c];
6196             if (f != c && ANYOF_BITMAP_TEST(n, f))
6197                 match = TRUE;
6198         }
6199
6200         if (!match && (flags & ANYOF_CLASS)) {
6201             PL_reg_flags |= RF_tainted;
6202             if (
6203                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6204                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6205                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6206                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6207                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6208                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6209                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6210                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6211                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6212                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6213                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6214                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6215                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6216                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6217                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6218                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6219                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6220                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6221                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6222                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6223                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6224                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6225                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6226                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6227                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6228                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6229                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6230                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6231                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6232                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6233                 ) /* How's that for a conditional? */
6234             {
6235                 match = TRUE;
6236             }
6237         }
6238     }
6239
6240     return (flags & ANYOF_INVERT) ? !match : match;
6241 }
6242
6243 STATIC U8 *
6244 S_reghop3(U8 *s, I32 off, const U8* lim)
6245 {
6246     dVAR;
6247
6248     PERL_ARGS_ASSERT_REGHOP3;
6249
6250     if (off >= 0) {
6251         while (off-- && s < lim) {
6252             /* XXX could check well-formedness here */
6253             s += UTF8SKIP(s);
6254         }
6255     }
6256     else {
6257         while (off++ && s > lim) {
6258             s--;
6259             if (UTF8_IS_CONTINUED(*s)) {
6260                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6261                     s--;
6262             }
6263             /* XXX could check well-formedness here */
6264         }
6265     }
6266     return s;
6267 }
6268
6269 #ifdef XXX_dmq
6270 /* there are a bunch of places where we use two reghop3's that should
6271    be replaced with this routine. but since thats not done yet
6272    we ifdef it out - dmq
6273 */
6274 STATIC U8 *
6275 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6276 {
6277     dVAR;
6278
6279     PERL_ARGS_ASSERT_REGHOP4;
6280
6281     if (off >= 0) {
6282         while (off-- && s < rlim) {
6283             /* XXX could check well-formedness here */
6284             s += UTF8SKIP(s);
6285         }
6286     }
6287     else {
6288         while (off++ && s > llim) {
6289             s--;
6290             if (UTF8_IS_CONTINUED(*s)) {
6291                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6292                     s--;
6293             }
6294             /* XXX could check well-formedness here */
6295         }
6296     }
6297     return s;
6298 }
6299 #endif
6300
6301 STATIC U8 *
6302 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6303 {
6304     dVAR;
6305
6306     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6307
6308     if (off >= 0) {
6309         while (off-- && s < lim) {
6310             /* XXX could check well-formedness here */
6311             s += UTF8SKIP(s);
6312         }
6313         if (off >= 0)
6314             return NULL;
6315     }
6316     else {
6317         while (off++ && s > lim) {
6318             s--;
6319             if (UTF8_IS_CONTINUED(*s)) {
6320                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6321                     s--;
6322             }
6323             /* XXX could check well-formedness here */
6324         }
6325         if (off <= 0)
6326             return NULL;
6327     }
6328     return s;
6329 }
6330
6331 static void
6332 restore_pos(pTHX_ void *arg)
6333 {
6334     dVAR;
6335     regexp * const rex = (regexp *)arg;
6336     if (PL_reg_eval_set) {
6337         if (PL_reg_oldsaved) {
6338             rex->subbeg = PL_reg_oldsaved;
6339             rex->sublen = PL_reg_oldsavedlen;
6340 #ifdef PERL_OLD_COPY_ON_WRITE
6341             rex->saved_copy = PL_nrs;
6342 #endif
6343             RXp_MATCH_COPIED_on(rex);
6344         }
6345         PL_reg_magic->mg_len = PL_reg_oldpos;
6346         PL_reg_eval_set = 0;
6347         PL_curpm = PL_reg_oldcurpm;
6348     }
6349 }
6350
6351 STATIC void
6352 S_to_utf8_substr(pTHX_ register regexp *prog)
6353 {
6354     int i = 1;
6355
6356     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6357
6358     do {
6359         if (prog->substrs->data[i].substr
6360             && !prog->substrs->data[i].utf8_substr) {
6361             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6362             prog->substrs->data[i].utf8_substr = sv;
6363             sv_utf8_upgrade(sv);
6364             if (SvVALID(prog->substrs->data[i].substr)) {
6365                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6366                 if (flags & FBMcf_TAIL) {
6367                     /* Trim the trailing \n that fbm_compile added last
6368                        time.  */
6369                     SvCUR_set(sv, SvCUR(sv) - 1);
6370                     /* Whilst this makes the SV technically "invalid" (as its
6371                        buffer is no longer followed by "\0") when fbm_compile()
6372                        adds the "\n" back, a "\0" is restored.  */
6373                 }
6374                 fbm_compile(sv, flags);
6375             }
6376             if (prog->substrs->data[i].substr == prog->check_substr)
6377                 prog->check_utf8 = sv;
6378         }
6379     } while (i--);
6380 }
6381
6382 STATIC void
6383 S_to_byte_substr(pTHX_ register regexp *prog)
6384 {
6385     dVAR;
6386     int i = 1;
6387
6388     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6389
6390     do {
6391         if (prog->substrs->data[i].utf8_substr
6392             && !prog->substrs->data[i].substr) {
6393             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6394             if (sv_utf8_downgrade(sv, TRUE)) {
6395                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6396                     const U8 flags
6397                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6398                     if (flags & FBMcf_TAIL) {
6399                         /* Trim the trailing \n that fbm_compile added last
6400                            time.  */
6401                         SvCUR_set(sv, SvCUR(sv) - 1);
6402                     }
6403                     fbm_compile(sv, flags);
6404                 }
6405             } else {
6406                 SvREFCNT_dec(sv);
6407                 sv = &PL_sv_undef;
6408             }
6409             prog->substrs->data[i].substr = sv;
6410             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6411                 prog->check_substr = sv;
6412         }
6413     } while (i--);
6414 }
6415
6416 /*
6417  * Local variables:
6418  * c-indentation-style: bsd
6419  * c-basic-offset: 4
6420  * indent-tabs-mode: t
6421  * End:
6422  *
6423  * ex: set ts=8 sts=4 sw=4 noet:
6424  */