src/5013008/orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * be confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82 /* RF_* are bit flags for PL_reg_flags (this file ORs RF_tainted and RF_utf8 into it). */
  83 #define RF_tainted      1               /* tainted information used? */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87 /* UTF_PATTERN is true (1) exactly when the RF_utf8 bit is set in PL_reg_flags. */
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92 /* Define STATIC only when nothing included earlier has already done so. */
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings only.  When the node 'p' has no ANYOF_FLAGS set,
  98  * the answer is read straight from its bitmap for the byte at 'c'; otherwise
  99  * the full reginclass() call is made. */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106 /* Length/distance and hopping helpers.  Each macro argument is parenthesized where it appears in an expression, and each cast expansion is wrapped in parentheses, so expression arguments such as "a - b" expand as intended. */
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : (a) - (b))
 109 /* HOPc/HOP3 move 'off' from 'pos': through reghop3() when PL_reg_match_utf8, else by plain byte arithmetic.  HOPBACKc moves backwards by 'off'; its byte branch yields NULL when pos - off would precede PL_bostr. */
 110 #define HOPc(pos,off) \
 111         ((char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)(pos), off, (U8*)((off) >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)((pos) + (off))))
 114 #define HOPBACKc(pos, off) \
 115         ((char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)(pos), -(off), (U8*)PL_bostr) \
 117             : ((pos) - (off) >= PL_bostr)       \
 118                 ? (U8*)(pos) - (off)            \
 119                 : NULL))
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)((pos) + (off)))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that is correct */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131 /* Per-class loaders: each hands LOAD_UTF8_CHARCLASS a one-character sample ("a", "0", " ") for the assert. */
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135 /* NOTE(review): the macro below expands to several statements with no STMT_START/STMT_END wrapper, and the caller supplies the final ';' -- use it only as a full statement inside braces. */
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140             * assert should likely and hopefully fail on an EBCDIC machine */ \
 141         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                                                                             \
 143         /* No asserts are done for these, in case called on an early        \
 144             * Unicode version in which they map to nothing */               \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 /*
 154    We don't use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test,
 155    so that it is possible to override the option here without having to
 156    rebuild the entire core, as we are required to do if we change regcomp.h,
 157    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 158 */
 159 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 160 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 161 #endif
 162
 163 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 164 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 165 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 166 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 167 #define RE_utf8_perl_word   PL_utf8_alnum
 168 #define RE_utf8_perl_space  PL_utf8_space
 169 #define RE_utf8_posix_digit PL_utf8_digit
 170 #define perl_word  alnum
 171 #define perl_space space
 172 #define posix_digit digit
 173 #else
 174 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 175 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 176 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 177 #define RE_utf8_perl_word   PL_utf8_perl_word
 178 #define RE_utf8_perl_space  PL_utf8_perl_space
 179 #define RE_utf8_posix_digit PL_utf8_posix_digit
 180 #endif
 181 /* _CCC_TRY_AFF_COMMON expands to the "case NAMEL:" / "case NAME:" labels of the enclosing switch and relies on scan, nextchr, locinput, utf8_target and sayNO being in scope there.  NAMEL sets RF_tainted and falls into NAME.  The match fails when nextchr is 0. */
 182 /* For a utf8 target where UTF8_IS_CONTINUED(nextchr) holds, it loads the class if needed, tests locinput with swash_fetch() (NAME) or LCFUNC_utf8() (NAMEL), advances by PL_utf8skip[nextchr] and breaks.  Otherwise control runs into whatever the wrapping macro appends.  The FUNC parameter is not used in the expansion. */
 183 #define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
 184         case NAMEL:                                                         \
 185             PL_reg_flags |= RF_tainted;                                     \
 186             /* FALL THROUGH */                                              \
 187         case NAME:                                                          \
 188             if (!nextchr)                                                   \
 189                 sayNO;                                                      \
 190             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
 191                 if (!CAT2(PL_utf8_,CLASS)) {                                \
 192                     bool ok;                                                \
 193                     ENTER;                                                  \
 194                     save_re_context();                                      \
 195                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
 196                     assert(ok);                                             \
 197                     LEAVE;                                                  \
 198                 }                                                           \
 199                 if (!(OP(scan) == NAME                                      \
 200                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 201                     : LCFUNC_utf8((U8*)locinput)))                          \
 202                 {                                                           \
 203                     sayNO;                                                  \
 204                 }                                                           \
 205                 locinput += PL_utf8skip[nextchr];                           \
 206                 nextchr = UCHARAT(locinput);                                \
 207                 break;                                                      \
 208             }                                                               \
 209             /* Drops through to the macro that calls this one */
 210 /* CCC_TRY_AFF adds the single-byte tail: FUNC(nextchr) for NAME, LCFUNC(nextchr) for NAMEL; on success it steps one byte.  The caller supplies the ';' after the final break. */
 211 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
 212     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
 213             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
 214                 sayNO;                                                      \
 215             nextchr = UCHARAT(++locinput);                                  \
 216             break
 217
 218 /* Like CCC_TRY_AFF, but the NAME (non-locale) form accepts a byte only if FUNCU(nextchr) holds and the byte is ASCII or the node has USE_UNI set,
 219  * i.e. chars between 128 and 255 match only under Unicode (latin1) semantics.  FUNCU (this macro's own parameter) is what gets forwarded to the common part. */
 220 #define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 221     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
 222             if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 223                 sayNO;                                                       \
 224             nextchr = UCHARAT(++locinput);                                   \
 225             break
 226 /* _CCC_TRY_NEG_COMMON is the negated counterpart of the AFF form: a character that IS in the class makes the match fail.  It fails on nextchr == 0 only when locinput has also reached PL_regeol.  It handles the utf8-target multi-byte case itself and otherwise runs into the text the wrapping macro appends.  FUNC is not used in the expansion. */
 227 #define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
 228         case NAMEL:                                                          \
 229             PL_reg_flags |= RF_tainted;                                      \
 230             /* FALL THROUGH */                                               \
 231         case NAME :                                                          \
 232             if (!nextchr && locinput >= PL_regeol)                           \
 233                 sayNO;                                                       \
 234             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
 235                 if (!CAT2(PL_utf8_,CLASS)) {                                 \
 236                     bool ok;                                                 \
 237                     ENTER;                                                   \
 238                     save_re_context();                                       \
 239                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
 240                     assert(ok);                                              \
 241                     LEAVE;                                                   \
 242                 }                                                            \
 243                 if ((OP(scan) == NAME                                        \
 244                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 245                     : LCFUNC_utf8((U8*)locinput)))                           \
 246                 {                                                            \
 247                     sayNO;                                                   \
 248                 }                                                            \
 249                 locinput += PL_utf8skip[nextchr];                            \
 250                 nextchr = UCHARAT(locinput);                                 \
 251                 break;                                                       \
 252             }
 253 /* CCC_TRY_NEG adds the single-byte tail: fail if FUNC(nextchr) (NAME) or LCFUNC(nextchr) (NAMEL) is true, else step one byte.  The caller supplies the ';' after the final break. */
 254 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
 255     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 256             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
 257                 sayNO;                                                       \
 258             nextchr = UCHARAT(++locinput);                                   \
 259             break
 260 /* CCC_TRY_NEG_U: as CCC_TRY_NEG, but for NAME the byte counts as in the class only if FUNCU(nextchr) holds and it is ASCII or the node has USE_UNI set. */
 261
 262 #define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 263     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
 264             if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 265                 sayNO;                                                       \
 266             nextchr = UCHARAT(++locinput);                                   \
 267             break
 268
 269
 270
 271 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 272
 273 /* for use after a quantifier and before an EXACT-like node -- japhy */
 274 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 275  *
 276  * NOTE that *nothing* that affects backtracking should be in here, specifically
 277  * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
 278  * node that is in between two EXACT-like nodes when ascertaining what the required
 279  * "follow" character is. This should probably be moved to regex compile time
 280  * although it may be done at run time because of the REF possibility - more
 281  * investigation required. -- demerphq
 282 */
 283 #define JUMPABLE(rn) (      \
 284     OP(rn) == OPEN ||       \
 285     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 286     OP(rn) == EVAL ||   \
 287     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 288     OP(rn) == PLUS || OP(rn) == MINMOD || \
 289     OP(rn) == KEEPS || \
 290     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 291 )
 292 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 293
 294 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 295
 296 #if 0
 297 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 298    we don't need this definition. */
 299 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 300 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 301 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 302 /* NOTE(review): this disabled branch defines no IS_TEXTFU, unlike the active branch below. */
 303 #else
 304 /* ... so we use this as it's faster. */
 305 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 306 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU )
 307 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 308 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 309
 310 #endif
 311
 312 /* FIND_NEXT_IMPT(rn): search for the mandatory following text node by advancing rn (which must be a
 313   modifiable node pointer) past every JUMPABLE node.  SUSPEND and CURLY-kind nodes are entered with
 314   NEXTOPER(NEXTOPER(rn)) and PLUS with NEXTOPER(rn); IFMATCH is entered the same way for lookahead
 315   (rn->flags == 0) but skipped with rn + ARG(rn) for lookbehind; anything else advances by NEXT_OFF(rn). */
 316 #define FIND_NEXT_IMPT(rn) STMT_START { \
 317     while (JUMPABLE(rn)) { \
 318         const OPCODE type = OP(rn); \
 319         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 320             rn = NEXTOPER(NEXTOPER(rn)); \
 321         else if (type == PLUS) \
 322             rn = NEXTOPER(rn); \
 323         else if (type == IFMATCH) \
 324             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 325         else rn += NEXT_OFF(rn); \
 326     } \
 327 } STMT_END
 328
 329 /* Forward declaration of a file-local function. */
 330 static void restore_pos(pTHX_ void *arg);
 331 /* Savestack slot counts shared by S_regcppush()/S_regcppop(): PAREN = slots per capture group (end, start, start_tmp, group number); OTHER = slots pushed once per call (regoffs, regsize, lastparen, lastcloseparen, reginput). */
 332 #define REGCP_PAREN_ELEMS 4
 333 #define REGCP_OTHER_ELEMS 5
 334 #define REGCP_FRAME_ELEMS 1
 335 /* REGCP_FRAME_ELEMS (the trailing SAVEt_REGCONTEXT cookie) are not part of
 336  * the REGCP_OTHER_ELEMS and are needed for the regexp context stack bookkeeping. */
 337 /* S_regcppush: save the capture state on the savestack.  For each group p in (parenfloor, PL_regsize], highest first, it pushes the group's end and start offsets, PL_reg_start_tmp[p] and p; then PL_regoffs, PL_regsize, *PL_reglastparen, *PL_reglastcloseparen and PL_reginput; then a SAVEt_REGCONTEXT cookie carrying the element count.  Returns the savestack index from before the pushes.  S_regcppop() pops in exactly the reverse order, so the push order here must not change. */
 338 STATIC CHECKPOINT
 339 S_regcppush(pTHX_ I32 parenfloor)
 340 {
 341     dVAR;
 342     const int retval = PL_savestack_ix;
 343     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 344     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 345     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 346     int p;
 347     GET_RE_DEBUG_FLAGS_DECL;
 348     /* parenfloor above PL_regsize would give a negative count. */
 349     if (paren_elems_to_push < 0)
 350         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 351     /* The count is stored shifted inside the cookie; croak if the shift lost high bits. */
 352     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 353         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 354                    " out of range (%lu-%ld)",
 355                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 356     /* Make room for everything, including the cookie, before pushing. */
 357     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 358
 359     for (p = PL_regsize; p > parenfloor; p--) {
 360 /* REGCP_PAREN_ELEMS are pushed per pair of parentheses. */
 361         SSPUSHINT(PL_regoffs[p].end);
 362         SSPUSHINT(PL_regoffs[p].start);
 363         SSPUSHPTR(PL_reg_start_tmp[p]);
 364         SSPUSHINT(p);
 365         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 366           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 367                       (UV)p, (IV)PL_regoffs[p].start,
 368                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 369                       (IV)PL_regoffs[p].end
 370         ));
 371     }
 372 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 373     SSPUSHPTR(PL_regoffs);
 374     SSPUSHINT(PL_regsize);
 375     SSPUSHINT(*PL_reglastparen);
 376     SSPUSHINT(*PL_reglastcloseparen);
 377     SSPUSHPTR(PL_reginput);
 378     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 379
 380     return retval;
 381 }
 382 /* REGCP_SET(cp) stores the current PL_savestack_ix in cp; REGCP_UNWIND(cp) unwinds back to it through regcpblow().  NOTE(review): both expand to two statements (debug output, then the action) with no STMT_START/STMT_END wrapper -- use them only as a full statement inside braces. */
 383 /* These are needed since we do not localize EVAL nodes: */
 384 #define REGCP_SET(cp)                                           \
 385     DEBUG_STATE_r(                                              \
 386             PerlIO_printf(Perl_debug_log,                       \
 387                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 388                 (IV)PL_savestack_ix));                          \
 389     cp = PL_savestack_ix
 390
 391 #define REGCP_UNWIND(cp)                                        \
 392     DEBUG_STATE_r(                                              \
 393         if (cp != PL_savestack_ix)                              \
 394             PerlIO_printf(Perl_debug_log,                       \
 395                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 396                 (IV)(cp), (IV)PL_savestack_ix));                \
 397     regcpblow(cp)
 398 /* S_regcppop: undo one S_regcppush().  Pops the cookie (asserting it is a SAVEt_REGCONTEXT), the saved input pointer (returned, not stored), *PL_reglastcloseparen, *PL_reglastparen, PL_regsize and PL_regoffs, then one REGCP_PAREN_ELEMS-slot record per saved group.  Afterwards, every group above *PL_reglastparen up to rex->nparens has .end set to -1, and .start too when the group is above PL_regsize. */
 399 STATIC char *
 400 S_regcppop(pTHX_ const regexp *rex)
 401 {
 402     dVAR;
 403     UV i;
 404     char *input;
 405     GET_RE_DEBUG_FLAGS_DECL;
 406
 407     PERL_ARGS_ASSERT_REGCPPOP;
 408
 409     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 410     i = SSPOPUV;
 411     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 412     i >>= SAVE_TIGHT_SHIFT; /* Total elements pushed (parentheses plus REGCP_OTHER_ELEMS). */
 413     input = (char *) SSPOPPTR;
 414     *PL_reglastcloseparen = SSPOPINT;
 415     *PL_reglastparen = SSPOPINT;
 416     PL_regsize = SSPOPINT;
 417     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 418
 419     i -= REGCP_OTHER_ELEMS;
 420     /* Now restore the parentheses context. */
 421     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 422         I32 tmps;
 423         U32 paren = (U32)SSPOPINT;
 424         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 425         PL_regoffs[paren].start = SSPOPINT;
 426         tmps = SSPOPINT;
 427         if (paren <= *PL_reglastparen) /* the saved end is dropped for groups above the restored last paren */
 428             PL_regoffs[paren].end = tmps;
 429         DEBUG_BUFFERS_r(
 430             PerlIO_printf(Perl_debug_log,
 431                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 432                           (UV)paren, (IV)PL_regoffs[paren].start,
 433                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 434                           (IV)PL_regoffs[paren].end,
 435                           (paren > *PL_reglastparen ? "(no)" : ""));
 436         );
 437     }
 438     DEBUG_BUFFERS_r(
 439         if (*PL_reglastparen + 1 <= rex->nparens) {
 440             PerlIO_printf(Perl_debug_log,
 441                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 442                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 443         }
 444     );
 445 #if 1
 446     /* It would seem that the similar code in regtry()
 447      * already takes care of this, and in fact it is in
 448      * a better location too, since this code can be #if 0-ed out
 449      * but the code in regtry() is needed or otherwise tests
 450      * requiring null fields (pat.t#187 and split.t#{13,14}
 451      * (as of patchlevel 7877)) will fail.  Then again,
 452      * this code seems to be necessary or otherwise
 453      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 454      * --jhi updated by dapm */
 455     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 456         if (i > PL_regsize)
 457             PL_regoffs[i].start = -1;
 458         PL_regoffs[i].end = -1;
 459     }
 460 #endif
 461     return input;
 462 }
 463 /* regcpblow(cp): hand the savestack index cp to LEAVE_SCOPE(); REGCP_UNWIND expands to this. */
 464 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 465
 466 /*
 467  * pregexec and friends
 468  */
 469
 470 #ifndef PERL_IN_XSUB_RE
 471 /*
 472  - pregexec - match a regexp against a string; a thin wrapper that forwards to regexec_flags()
 473  */
 474 I32
 475 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 476          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 477 /* strend: points at the null that ends the string */
 478 /* strbeg: the real beginning of the string */
 479 /* minend: the match must end at least minend after stringarg */
 480 /* nosave: an optimization; when nonzero, REXEC_COPY_STR is not passed on */
 481 {
 482     U32 exec_flags = REXEC_COPY_STR;    /* flags handed to regexec_flags() */
 483     PERL_ARGS_ASSERT_PREGEXEC;
 484     if (nosave)
 485         exec_flags = 0;
 486     return regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL, exec_flags);
 487 }
 488 #endif
 489
 490 /*
 491  * Need to implement the following flags for reg_anch:
 492  *
 493  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 494  * USE_INTUIT_ML
 495  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 496  * INTUIT_AUTORITATIVE_ML
 497  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 498  * INTUIT_ONCE_ML
 499  *
 500  * Another flag for this function: SECOND_TIME (so that float substrs
 501  * with giant delta may be not rechecked).
 502  */
 503
 504 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 505
 506 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 507    Otherwise, only SvCUR(sv) is used to get strbeg. */
 508
 509 /* XXXX We assume that strpos is strbeg unless sv. */
 510
 511 /* XXXX Some places assume that there is a fixed substring.
 512         An update may be needed if optimizer marks as "INTUITable"
 513         RExen without fixed substrings.  Similarly, it is assumed that
 514         lengths of all the strings are no more than minlen, thus they
 515         cannot come from lookahead.
 516         (Or minlen should take into account lookahead.)
 517   NOTE: Some of this comment is not correct. minlen does now take account
 518   of lookahead/behind. Further research is required. -- demerphq
 519
 520 */
 521
 522 /* A failure to find a constant substring means that there is no need to make
 523    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 524    finding a substring too deep into the string means that fewer calls to
 525    regtry() should be needed.
 526
 527    REx compiler's optimizer found 4 possible hints:
 528         a) Anchored substring;
 529         b) Fixed substring;
 530         c) Whether we are anchored (beginning-of-line or \G);
 531         d) First node (of those at offset 0) which may distinguish positions;
 532    We use a)b)d) and multiline-part of c), and try to find a position in the
 533    string which does not contradict any of them.
 534  */
 535
 536 /* Most of the decisions we make here should have been made at compile time.
 537    The nodes of the REx which we used for the search should have been
 538    deleted from the finite automaton. */
 539
 540 char *
 541 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 542                      char *strend, const U32 flags, re_scream_pos_data *data)
 543 {
 544     dVAR;
 545     struct regexp *const prog = (struct regexp *)SvANY(rx);
 546     register I32 start_shift = 0;
 547     /* Should be nonnegative! */
 548     register I32 end_shift   = 0;
 549     register char *s;
 550     register SV *check;
 551     char *strbeg;
 552     char *t;
 553     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 554     I32 ml_anch;
 555     register char *other_last = NULL;   /* other substr checked before this */
 556     char *check_at = NULL;              /* check substr found at this pos */
 557     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 558     RXi_GET_DECL(prog,progi);
 559 #ifdef DEBUGGING
 560     const char * const i_strpos = strpos;
 561 #endif
 562     GET_RE_DEBUG_FLAGS_DECL;
 563
 564     PERL_ARGS_ASSERT_RE_INTUIT_START;
 565
 566     RX_MATCH_UTF8_set(rx,utf8_target);
 567
 568     if (RX_UTF8(rx)) {
 569         PL_reg_flags |= RF_utf8;
 570     }
 571     DEBUG_EXECUTE_r(
 572         debug_start_match(rx, utf8_target, strpos, strend,
 573             sv ? "Guessing start of match in sv for"
 574                : "Guessing start of match in string for");
 575               );
 576
 577     /* CHR_DIST() would be more correct here but it makes things slow. */
 578     if (prog->minlen > strend - strpos) {
 579         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 580                               "String too short... [re_intuit_start]\n"));
 581         goto fail;
 582     }
 583
 584     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 585     PL_regeol = strend;
 586     if (utf8_target) {
 587         if (!prog->check_utf8 && prog->check_substr)
 588             to_utf8_substr(prog);
 589         check = prog->check_utf8;
 590     } else {
 591         if (!prog->check_substr && prog->check_utf8)
 592             to_byte_substr(prog);
 593         check = prog->check_substr;
 594     }
 595     if (check == &PL_sv_undef) {
 596         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 597                 "Non-utf8 string cannot match utf8 check string\n"));
 598         goto fail;
 599     }
 600     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 601         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 602                      || ( (prog->extflags & RXf_ANCH_BOL)
 603                           && !multiline ) );    /* Check after \n? */
 604
 605         if (!ml_anch) {
 606           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 607                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 608                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 609                && sv && !SvROK(sv)
 610                && (strpos != strbeg)) {
 611               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 612               goto fail;
 613           }
 614           if (prog->check_offset_min == prog->check_offset_max &&
 615               !(prog->extflags & RXf_CANY_SEEN)) {
 616             /* Substring at constant offset from beg-of-str... */
 617             I32 slen;
 618
 619             s = HOP3c(strpos, prog->check_offset_min, strend);
 620
 621             if (SvTAIL(check)) {
 622                 slen = SvCUR(check);    /* >= 1 */
 623
 624                 if ( strend - s > slen || strend - s < slen - 1
 625                      || (strend - s == slen && strend[-1] != '\n')) {
 626                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 627                     goto fail_finish;
 628                 }
 629                 /* Now should match s[0..slen-2] */
 630                 slen--;
 631                 if (slen && (*SvPVX_const(check) != *s
 632                              || (slen > 1
 633                                  && memNE(SvPVX_const(check), s, slen)))) {
 634                   report_neq:
 635                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 636                     goto fail_finish;
 637                 }
 638             }
 639             else if (*SvPVX_const(check) != *s
 640                      || ((slen = SvCUR(check)) > 1
 641                          && memNE(SvPVX_const(check), s, slen)))
 642                 goto report_neq;
 643             check_at = s;
 644             goto success_at_start;
 645           }
 646         }
 647         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 648         s = strpos;
 649         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 650         end_shift = prog->check_end_shift;
 651
 652         if (!ml_anch) {
 653             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 654                                          - (SvTAIL(check) != 0);
 655             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 656
 657             if (end_shift < eshift)
 658                 end_shift = eshift;
 659         }
 660     }
 661     else {                              /* Can match at random position */
 662         ml_anch = 0;
 663         s = strpos;
 664         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 665         end_shift = prog->check_end_shift;
 666
 667         /* end shift should be non negative here */
 668     }
 669
 670 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 671     if (end_shift < 0)
 672         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 673                    (IV)end_shift, RX_PRECOMP(prog));
 674 #endif
 675
 676   restart:
 677     /* Find a possible match in the region s..strend by looking for
 678        the "check" substring in the region corrected by start/end_shift. */
 679
 680     {
 681         I32 srch_start_shift = start_shift;
 682         I32 srch_end_shift = end_shift;
 683         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 684             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 685             srch_start_shift = strbeg - s;
 686         }
 687     DEBUG_OPTIMISE_MORE_r({
 688         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 689             (IV)prog->check_offset_min,
 690             (IV)srch_start_shift,
 691             (IV)srch_end_shift,
 692             (IV)prog->check_end_shift);
 693     });
 694
 695     if (flags & REXEC_SCREAM) {
 696         I32 p = -1;                     /* Internal iterator of scream. */
 697         I32 * const pp = data ? data->scream_pos : &p;
 698
 699         if (PL_screamfirst[BmRARE(check)] >= 0
 700             || ( BmRARE(check) == '\n'
 701                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 702                  && SvTAIL(check) ))
 703             s = screaminstr(sv, check,
 704                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 705         else
 706             goto fail_finish;
 707         /* we may be pointing at the wrong string */
 708         if (s && RXp_MATCH_COPIED(prog))
 709             s = strbeg + (s - SvPVX_const(sv));
 710         if (data)
 711             *data->scream_olds = s;
 712     }
 713     else {
 714         U8* start_point;
 715         U8* end_point;
 716         if (prog->extflags & RXf_CANY_SEEN) {
 717             start_point= (U8*)(s + srch_start_shift);
 718             end_point= (U8*)(strend - srch_end_shift);
 719         } else {
 720             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 721             end_point= HOP3(strend, -srch_end_shift, strbeg);
 722         }
 723         DEBUG_OPTIMISE_MORE_r({
 724             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 725                 (int)(end_point - start_point),
 726                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 727                 start_point);
 728         });
 729
 730         s = fbm_instr( start_point, end_point,
 731                       check, multiline ? FBMrf_MULTILINE : 0);
 732     }
 733     }
 734     /* Update the count-of-usability, remove useless subpatterns,
 735         unshift s.  */
 736
 737     DEBUG_EXECUTE_r({
 738         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 739             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 740         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 741                           (s ? "Found" : "Did not find"),
 742             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 743                 ? "anchored" : "floating"),
 744             quoted,
 745             RE_SV_TAIL(check),
 746             (s ? " at offset " : "...\n") );
 747     });
 748
 749     if (!s)
 750         goto fail_finish;
 751     /* Finish the diagnostic message */
 752     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 753
 754     /* XXX dmq: first branch is for positive lookbehind...
 755        Our check string is offset from the beginning of the pattern.
 756        So we need to do any stclass tests offset forward from that
 757        point. I think. :-(
 758      */
 759
 760
 761
 762     check_at=s;
 763
 764
 765     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 766        Start with the other substr.
 767        XXXX no SCREAM optimization yet - and a very coarse implementation
 768        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 769                 *always* match.  Probably should be marked during compile...
 770        Probably it is right to do no SCREAM here...
 771      */
 772
 773     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 774                 : (prog->float_substr && prog->anchored_substr))
 775     {
 776         /* Take into account the "other" substring. */
 777         /* XXXX May be hopelessly wrong for UTF... */
 778         if (!other_last)
 779             other_last = strpos;
 780         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 781           do_other_anchored:
 782             {
 783                 char * const last = HOP3c(s, -start_shift, strbeg);
 784                 char *last1, *last2;
 785                 char * const saved_s = s;
 786                 SV* must;
 787
 788                 t = s - prog->check_offset_max;
 789                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 790                     && (!utf8_target
 791                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 792                             && t > strpos)))
 793                     NOOP;
 794                 else
 795                     t = strpos;
 796                 t = HOP3c(t, prog->anchored_offset, strend);
 797                 if (t < other_last)     /* These positions already checked */
 798                     t = other_last;
 799                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 800                 if (last < last1)
 801                     last1 = last;
 802                 /* XXXX It is not documented what units *_offsets are in.
 803                    We assume bytes, but this is clearly wrong.
 804                    Meaning this code needs to be carefully reviewed for errors.
 805                    dmq.
 806                   */
 807
 808                 /* On end-of-str: see comment below. */
 809                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 810                 if (must == &PL_sv_undef) {
 811                     s = (char*)NULL;
 812                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 813                 }
 814                 else
 815                     s = fbm_instr(
 816                         (unsigned char*)t,
 817                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 818                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 819                         must,
 820                         multiline ? FBMrf_MULTILINE : 0
 821                     );
 822                 DEBUG_EXECUTE_r({
 823                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 824                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 825                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 826                         (s ? "Found" : "Contradicts"),
 827                         quoted, RE_SV_TAIL(must));
 828                 });
 829
 830
 831                 if (!s) {
 832                     if (last1 >= last2) {
 833                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 834                                                 ", giving up...\n"));
 835                         goto fail_finish;
 836                     }
 837                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 838                         ", trying floating at offset %ld...\n",
 839                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 840                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 841                     s = HOP3c(last, 1, strend);
 842                     goto restart;
 843                 }
 844                 else {
 845                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 846                           (long)(s - i_strpos)));
 847                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 848                     other_last = HOP3c(s, 1, strend);
 849                     s = saved_s;
 850                     if (t == strpos)
 851                         goto try_at_start;
 852                     goto try_at_offset;
 853                 }
 854             }
 855         }
 856         else {          /* Take into account the floating substring. */
 857             char *last, *last1;
 858             char * const saved_s = s;
 859             SV* must;
 860
 861             t = HOP3c(s, -start_shift, strbeg);
 862             last1 = last =
 863                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 864             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 865                 last = HOP3c(t, prog->float_max_offset, strend);
 866             s = HOP3c(t, prog->float_min_offset, strend);
 867             if (s < other_last)
 868                 s = other_last;
 869  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 870             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 871             /* fbm_instr() takes into account exact value of end-of-str
 872                if the check is SvTAIL(ed).  Since false positives are OK,
 873                and end-of-str is not later than strend we are OK. */
 874             if (must == &PL_sv_undef) {
 875                 s = (char*)NULL;
 876                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 877             }
 878             else
 879                 s = fbm_instr((unsigned char*)s,
 880                               (unsigned char*)last + SvCUR(must)
 881                                   - (SvTAIL(must)!=0),
 882                               must, multiline ? FBMrf_MULTILINE : 0);
 883             DEBUG_EXECUTE_r({
 884                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 885                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 886                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 887                     (s ? "Found" : "Contradicts"),
 888                     quoted, RE_SV_TAIL(must));
 889             });
 890             if (!s) {
 891                 if (last1 == last) {
 892                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 893                                             ", giving up...\n"));
 894                     goto fail_finish;
 895                 }
 896                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 897                     ", trying anchored starting at offset %ld...\n",
 898                     (long)(saved_s + 1 - i_strpos)));
 899                 other_last = last;
 900                 s = HOP3c(t, 1, strend);
 901                 goto restart;
 902             }
 903             else {
 904                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 905                       (long)(s - i_strpos)));
 906                 other_last = s; /* Fix this later. --Hugo */
 907                 s = saved_s;
 908                 if (t == strpos)
 909                     goto try_at_start;
 910                 goto try_at_offset;
 911             }
 912         }
 913     }
 914
 915
 916     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 917
 918     DEBUG_OPTIMISE_MORE_r(
 919         PerlIO_printf(Perl_debug_log,
 920             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 921             (IV)prog->check_offset_min,
 922             (IV)prog->check_offset_max,
 923             (IV)(s-strpos),
 924             (IV)(t-strpos),
 925             (IV)(t-s),
 926             (IV)(strend-strpos)
 927         )
 928     );
 929
 930     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 931         && (!utf8_target
 932             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 933                  && t > strpos)))
 934     {
 935         /* Fixed substring is found far enough so that the match
 936            cannot start at strpos. */
 937       try_at_offset:
 938         if (ml_anch && t[-1] != '\n') {
 939             /* Eventually fbm_*() should handle this, but often
 940                anchored_offset is not 0, so this check will not be wasted. */
 941             /* XXXX In the code below we prefer to look for "^" even in
 942                presence of anchored substrings.  And we search even
 943                beyond the found float position.  These pessimizations
 944                are historical artefacts only.  */
 945           find_anchor:
 946             while (t < strend - prog->minlen) {
 947                 if (*t == '\n') {
 948                     if (t < check_at - prog->check_offset_min) {
 949                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 950                             /* Since we moved from the found position,
 951                                we definitely contradict the found anchored
 952                                substr.  Due to the above check we do not
 953                                contradict "check" substr.
 954                                Thus we can arrive here only if check substr
 955                                is float.  Redo checking for "other"=="fixed".
 956                              */
 957                             strpos = t + 1;
 958                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 959                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 960                             goto do_other_anchored;
 961                         }
 962                         /* We don't contradict the found floating substring. */
 963                         /* XXXX Why not check for STCLASS? */
 964                         s = t + 1;
 965                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 966                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 967                         goto set_useful;
 968                     }
 969                     /* Position contradicts check-string */
 970                     /* XXXX probably better to look for check-string
 971                        than for "\n", so one should lower the limit for t? */
 972                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 973                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 974                     other_last = strpos = s = t + 1;
 975                     goto restart;
 976                 }
 977                 t++;
 978             }
 979             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 980                         PL_colors[0], PL_colors[1]));
 981             goto fail_finish;
 982         }
 983         else {
 984             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 985                         PL_colors[0], PL_colors[1]));
 986         }
 987         s = t;
 988       set_useful:
 989         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 990     }
 991     else {
 992         /* The found string does not prohibit matching at strpos,
 993            - no optimization of calling REx engine can be performed,
 994            unless it was an MBOL and we are not after MBOL,
 995            or a future STCLASS check will fail this. */
 996       try_at_start:
 997         /* Even in this situation we may use MBOL flag if strpos is offset
 998            wrt the start of the string. */
 999         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1000             && (strpos != strbeg) && strpos[-1] != '\n'
1001             /* May be due to an implicit anchor of m{.*foo}  */
1002             && !(prog->intflags & PREGf_IMPLICIT))
1003         {
1004             t = strpos;
1005             goto find_anchor;
1006         }
1007         DEBUG_EXECUTE_r( if (ml_anch)
1008             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1009                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1010         );
1011       success_at_start:
1012         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1013             && (utf8_target ? (
1014                 prog->check_utf8                /* Could be deleted already */
1015                 && --BmUSEFUL(prog->check_utf8) < 0
1016                 && (prog->check_utf8 == prog->float_utf8)
1017             ) : (
1018                 prog->check_substr              /* Could be deleted already */
1019                 && --BmUSEFUL(prog->check_substr) < 0
1020                 && (prog->check_substr == prog->float_substr)
1021             )))
1022         {
1023             /* If flags & SOMETHING - do not do it many times on the same match */
1024             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1025             /* XXX Does the destruction order have to change with utf8_target? */
1026             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1027             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1028             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1029             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1030             check = NULL;                       /* abort */
1031             s = strpos;
1032             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1033                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1034             if (prog->intflags & PREGf_IMPLICIT)
1035                 prog->extflags &= ~RXf_ANCH_MBOL;
1036             /* XXXX This is a remnant of the old implementation.  It
1037                     looks wasteful, since now INTUIT can use many
1038                     other heuristics. */
1039             prog->extflags &= ~RXf_USE_INTUIT;
1040             /* XXXX What other flags might need to be cleared in this branch? */
1041         }
1042         else
1043             s = strpos;
1044     }
1045
1046     /* Last resort... */
1047     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1048     /* trie stclasses are too expensive to use here, we are better off to
1049        leave it to regmatch itself */
1050     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1051         /* minlen == 0 is possible if regstclass is \b or \B,
1052            and the fixed substr is ''$.
1053            Since minlen is already taken into account, s+1 is before strend;
1054            accidentally, minlen >= 1 guarantees no false positives at s + 1
1055            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1056            regstclass does not come from lookahead...  */
1057         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1058            This leaves EXACTF, EXACTFU only, which are dealt with in find_byclass().  */
1059         const U8* const str = (U8*)STRING(progi->regstclass);
1060         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1061                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1062                     : 1);
1063         char * endpos;
1064         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1065             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1066         else if (prog->float_substr || prog->float_utf8)
1067             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1068         else
1069             endpos= strend;
1070
1071         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1072                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1073
1074         t = s;
1075         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1076         if (!s) {
1077 #ifdef DEBUGGING
1078             const char *what = NULL;
1079 #endif
1080             if (endpos == strend) {
1081                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1082                                 "Could not match STCLASS...\n") );
1083                 goto fail;
1084             }
1085             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1086                                    "This position contradicts STCLASS...\n") );
1087             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1088                 goto fail;
1089             /* Contradict one of substrings */
1090             if (prog->anchored_substr || prog->anchored_utf8) {
1091                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1092                     DEBUG_EXECUTE_r( what = "anchored" );
1093                   hop_and_restart:
1094                     s = HOP3c(t, 1, strend);
1095                     if (s + start_shift + end_shift > strend) {
1096                         /* XXXX Should be taken into account earlier? */
1097                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1098                                                "Could not match STCLASS...\n") );
1099                         goto fail;
1100                     }
1101                     if (!check)
1102                         goto giveup;
1103                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1104                                 "Looking for %s substr starting at offset %ld...\n",
1105                                  what, (long)(s + start_shift - i_strpos)) );
1106                     goto restart;
1107                 }
1108                 /* Have both, check_string is floating */
1109                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1110                     goto retry_floating_check;
1111                 /* Recheck anchored substring, but not floating... */
1112                 s = check_at;
1113                 if (!check)
1114                     goto giveup;
1115                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1116                           "Looking for anchored substr starting at offset %ld...\n",
1117                           (long)(other_last - i_strpos)) );
1118                 goto do_other_anchored;
1119             }
1120             /* Another way we could have checked stclass at the
1121                current position only: */
1122             if (ml_anch) {
1123                 s = t = t + 1;
1124                 if (!check)
1125                     goto giveup;
1126                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1127                           "Looking for /%s^%s/m starting at offset %ld...\n",
1128                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1129                 goto try_at_offset;
1130             }
1131             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1132                 goto fail;
1133             /* Check is floating substring. */
1134           retry_floating_check:
1135             t = check_at - start_shift;
1136             DEBUG_EXECUTE_r( what = "floating" );
1137             goto hop_and_restart;
1138         }
1139         if (t != s) {
1140             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1141                         "By STCLASS: moving %ld --> %ld\n",
1142                                   (long)(t - i_strpos), (long)(s - i_strpos))
1143                    );
1144         }
1145         else {
1146             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1147                                   "Does not contradict STCLASS...\n");
1148                    );
1149         }
1150     }
1151   giveup:
1152     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1153                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1154                           PL_colors[5], (long)(s - i_strpos)) );
1155     return s;
1156
1157   fail_finish:                          /* Substring not found */
1158     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1159         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1160   fail:
1161     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1162                           PL_colors[4], PL_colors[5]));
1163     return NULL;
1164 }
1165
/*
 * DECL_TRIE_TYPE(scan): declare a const local named `trie_type` that records
 * which character-reading mode a trie node needs.
 *
 *   (scan)->flags == EXACT:
 *       trie_utf8   when utf8_target is true, trie_plain otherwise.
 *   (scan)->flags != EXACT:
 *       trie_utf8_fold        when utf8_target is true;
 *       trie_latin_utf8_fold  when utf8_target is false and UTF_PATTERN is true;
 *       trie_plain            otherwise.
 *
 * The expansion is a declaration WITHOUT a trailing semicolon (the caller
 * writes it), and it reads `utf8_target` from the enclosing scope.  Because
 * it also declares the anonymous enum's constants, it can appear at most
 * once per block scope.
 *
 * `scan` is parenthesized so that any pointer-valued expression may be
 * passed, not only a plain identifier.
 */
#define DECL_TRIE_TYPE(scan) \
    const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
                    trie_type = ((scan)->flags != EXACT) \
                              ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
                              : (utf8_target ? trie_utf8 : trie_plain)
1171
/*
 * REXEC_TRIE_READ_CHAR: obtain the next code point to feed to a trie and
 * translate it into the trie's character id.
 *
 * Inputs:
 *   trie_type    one of the modes declared by DECL_TRIE_TYPE
 *   trie         pointer whose ->charmap[] is indexed by code points < 256
 *   widecharmap  hash consulted for code points >= 256 (may be NULL)
 *   uc           pointer to the current position in the input
 *   uniflags     flags handed through to utf8n_to_uvuni()
 * Updated in place (must be lvalues):
 *   uvc          the code point that was read
 *   len          number of bytes consumed from `uc`; set to 0 when the code
 *                point was taken from `uscan` instead
 *   charid       trie->charmap[uvc] for uvc < 256; otherwise the widecharmap
 *                entry for uvc, or 0 when there is no map or no entry
 *   uscan, foldlen, foldbuf
 *                fold state: to_uni_fold() writes into foldbuf; afterwards
 *                uscan points just past the first folded code point and
 *                foldlen holds the byte count still unread.
 *
 * Rvalue arguments (trie, uc, foldbuf) are parenthesized where they meet a
 * cast, `*` or `->`, so arbitrary pointer expressions may be passed.
 */
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
    switch (trie_type) {                                                    \
    case trie_utf8_fold:                                                    \
        if ( foldlen>0 ) {                                                  \
            /* presumably the unread remainder of an earlier fold */        \
            uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
            foldlen -= len;                                                 \
            uscan += len;                                                   \
            len=0;                                                          \
        } else {                                                            \
            uvc = utf8n_to_uvuni( (U8*)(uc), UTF8_MAXLEN, &len, uniflags ); \
            uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
            foldlen -= UNISKIP( uvc );                                      \
            uscan = (foldbuf) + UNISKIP( uvc );                             \
        }                                                                   \
        break;                                                              \
    case trie_latin_utf8_fold:                                              \
        if ( foldlen>0 ) {                                                  \
            uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
            foldlen -= len;                                                 \
            uscan += len;                                                   \
            len=0;                                                          \
        } else {                                                            \
            /* input is read one byte at a time in this mode */             \
            len = 1;                                                        \
            uvc = to_uni_fold( *(U8*)(uc), foldbuf, &foldlen );             \
            foldlen -= UNISKIP( uvc );                                      \
            uscan = (foldbuf) + UNISKIP( uvc );                             \
        }                                                                   \
        break;                                                              \
    case trie_utf8:                                                         \
        uvc = utf8n_to_uvuni( (U8*)(uc), UTF8_MAXLEN, &len, uniflags );     \
        break;                                                              \
    case trie_plain:                                                        \
        uvc = (UV)*(uc);                                                    \
        len = 1;                                                            \
        break;                                                              \
    }                                                                       \
    if (uvc < 256) {                                                        \
        charid = (trie)->charmap[ uvc ];                                    \
    }                                                                       \
    else {                                                                  \
        charid = 0;                                                         \
        if (widecharmap) {                                                  \
            /* the hash key is the raw bytes of the UV itself */            \
            SV** const svpp = hv_fetch(widecharmap,                         \
                        (char*)&uvc, sizeof(UV), 0);                        \
            if (svpp)                                                       \
                charid = (U16)SvIV(*svpp);                                  \
        }                                                                   \
    }                                                                       \
} STMT_END
1221
/*
 * REXEC_FBC_EXACTISH_CHECK(CoNd): test the current position `s` for a
 * case-insensitive match and then step `s` forward by `len`.
 *
 * First attempt: if CoNd holds, and either ln == len or foldEQ_utf8()
 * accepts the text at `s` against `m`, and there is no reginfo or
 * regtry() succeeds, control jumps to the label got_it.
 *
 * Second attempt (taken whenever the first one did not jump): `c` is
 * encoded into tmpbuf and folded into `f`.  If f differs from c and equals
 * c1 or c2, the same length/foldEQ_utf8()/regtry() tests are repeated and
 * may jump to got_it.
 *
 * If neither attempt jumps, s is advanced by len.
 *
 * Uses from the enclosing scope: s, strend, m, ln, len, c, c1, c2, f,
 * foldlen, tmpbuf, utf8_target, reginfo, and the label got_it.
 *
 * The whole expansion is one STMT_START/STMT_END statement, so the advance
 * of `s` cannot detach from the check under an unbraced if/else.
 */
#define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
STMT_START {                                           \
    char *my_strend= (char *)strend;                   \
    if ( (CoNd)                                        \
         && (ln == len ||                              \
             foldEQ_utf8(s, &my_strend, 0,  utf8_target,   \
                        m, NULL, ln, cBOOL(UTF_PATTERN)))      \
         && (!reginfo || regtry(reginfo, &s)) )        \
        goto got_it;                                   \
    else {                                             \
         U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
         uvchr_to_utf8(tmpbuf, c);                     \
         f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
         if ( f != c                                   \
              && (f == c1 || f == c2)                  \
              && (ln == len ||                         \
                foldEQ_utf8(s, &my_strend, 0,  utf8_target,\
                              m, NULL, ln, cBOOL(UTF_PATTERN)))\
              && (!reginfo || regtry(reginfo, &s)) )   \
              goto got_it;                             \
    }                                                  \
    s += len;                                          \
} STMT_END
1245
/*
 * REXEC_FBC_EXACTISH_SCAN(CoNd): walk `s` one byte at a time up to and
 * including `e`, looking for a position where a folding EXACT-type node
 * can match.
 *
 * The comparison routine is chosen from the opcode of node `c`:
 *   EXACTFU -> foldEQ_latin1, EXACTFL -> foldEQ_locale, EXACTF -> foldEQ;
 * any other opcode croaks.
 *
 * A position is accepted, by jumping to got_it, when CoNd is true, the
 * text compares equal to `m` over `ln` bytes (the comparison is skipped
 * when ln == 1), and either there is no reginfo or regtry() succeeds.
 *
 * Uses from the enclosing scope: c, s, e, m, ln, reginfo, label got_it.
 */
#define REXEC_FBC_EXACTISH_SCAN(CoNd)                         \
STMT_START {                                                  \
    re_fold_t folder;                                         \
    if (OP(c) == EXACTFU)                                     \
        folder = foldEQ_latin1;                               \
    else if (OP(c) == EXACTFL)                                \
        folder = foldEQ_locale;                               \
    else if (OP(c) == EXACTF)                                 \
        folder = foldEQ;                                      \
    else                                                      \
        Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c));   \
    for (; s <= e; s++) {                                     \
        if (!(CoNd))                                          \
            continue;                                         \
        if (ln != 1 && !folder(s, m, ln))                     \
            continue;                                         \
        if (!reginfo || regtry(reginfo, &s))                  \
            goto got_it;                                      \
    }                                                         \
} STMT_END
1264
/* REXEC_FBC_UTF8_SCAN(CoDe): run CoDe once per UTF-8 character.
 *
 * Uses s, strend and uskip from the expansion site.  Before each iteration
 * uskip is set to UTF8SKIP(s); the loop continues only while the whole
 * character fits before strend, and s is advanced by uskip after CoDe.
 * CoDe is pasted in as-is and the advance follows it inside the loop body,
 * so a `continue` inside CoDe would skip the advance. */
1265 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1266 STMT_START {                                          \
1267     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1268         CoDe                                          \
1269         s += uskip;                                   \
1270     }                                                 \
1271 } STMT_END
1272
/* REXEC_FBC_SCAN(CoDe): run CoDe once per byte.
 *
 * Uses s and strend from the expansion site.  CoDe is pasted in as-is and
 * is followed by s++ inside the loop body, which runs while s < strend. */
1273 #define REXEC_FBC_SCAN(CoDe)                          \
1274 STMT_START {                                          \
1275     while (s < strend) {                              \
1276         CoDe                                          \
1277         s++;                                          \
1278     }                                                 \
1279 } STMT_END
1280
/* REXEC_FBC_UTF8_CLASS_SCAN(CoNd): character-class scan over UTF-8 text.
 *
 * Uses tmp, doevery, reginfo and the label got_it from the expansion site
 * (plus whatever REXEC_FBC_UTF8_SCAN needs).  For each character:
 *   - CoNd true:  if tmp is non-zero and either reginfo is NULL or regtry()
 *                 succeeds, jump to got_it; otherwise set tmp = doevery.
 *   - CoNd false: set tmp = 1.
 * Consequently, when doevery is 0, a failed attempt suppresses further
 * attempts until a character that does not satisfy CoNd has been seen. */
1281 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1282 REXEC_FBC_UTF8_SCAN(                                  \
1283     if (CoNd) {                                       \
1284         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1285             goto got_it;                              \
1286         else                                          \
1287             tmp = doevery;                            \
1288     }                                                 \
1289     else                                              \
1290         tmp = 1;                                      \
1291 )
1292
/* REXEC_FBC_CLASS_SCAN(CoNd): character-class scan over single-byte text.
 *
 * Same per-position logic as the UTF-8 variant, but driven by
 * REXEC_FBC_SCAN (one byte per step):
 *   - CoNd true:  if tmp is non-zero and either reginfo is NULL or regtry()
 *                 succeeds, jump to got_it; otherwise set tmp = doevery.
 *   - CoNd false: set tmp = 1. */
1293 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1294 REXEC_FBC_SCAN(                                       \
1295     if (CoNd) {                                       \
1296         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1297             goto got_it;                              \
1298         else                                          \
1299             tmp = doevery;                            \
1300     }                                                 \
1301     else                                              \
1302         tmp = 1;                                      \
1303 )
1304
/* REXEC_FBC_TRYIT: jump to got_it if this is a dry run (reginfo is NULL) or
 * regtry() succeeds at s.  Expands to a bare `if` statement without its
 * terminating semicolon; the expansion site supplies the `;`. */
1305 #define REXEC_FBC_TRYIT               \
1306 if ((!reginfo || regtry(reginfo, &s))) \
1307     goto got_it
1308
/* REXEC_FBC_CSCAN(CoNdUtF8,CoNd): complete body for a `case` label.
 *
 * Runs the UTF-8 class scan with CoNdUtF8 when utf8_target is true,
 * otherwise the byte class scan with CoNd, and then executes `break`.
 * The expansion ends with `break` without a semicolon; the expansion site
 * supplies the `;`. */
1309 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1310     if (utf8_target) {                                             \
1311         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1312     }                                                          \
1313     else {                                                     \
1314         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1315     }                                                          \
1316     break
1317
/* REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd): like REXEC_FBC_CSCAN,
 * but first executes the statement UtFpReLoAd.  The preload runs only in
 * the utf8_target branch, before the UTF-8 class scan; the byte branch does
 * not execute it.  Ends with `break` (no semicolon). */
1318 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1319     if (utf8_target) {                                             \
1320         UtFpReLoAd;                                            \
1321         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1322     }                                                          \
1323     else {                                                     \
1324         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1325     }                                                          \
1326     break
1327
/* REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd): like REXEC_FBC_CSCAN, but first
 * sets RF_tainted in PL_reg_flags (unconditionally, before choosing the
 * UTF-8 or byte scan).  Ends with `break` (no semicolon). */
1328 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1329     PL_reg_flags |= RF_tainted;                                \
1330     if (utf8_target) {                                             \
1331         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1332     }                                                          \
1333     else {                                                     \
1334         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1335     }                                                          \
1336     break
1337
/* DUMP_EXEC_POS(li,s,doutf8): call dump_exec_pos() with the three caller
 * arguments plus the globals PL_regeol, PL_bostr and PL_reg_starttry. */
1338 #define DUMP_EXEC_POS(li,s,doutf8) \
1339     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1340
1341 /* We know what class REx starts with.  Try to find this position... */
1342 /* If reginfo is NULL, it's a dry run. */
1343 /* annoyingly all the vars in this routine have different names from their counterparts
1344    in regmatch. /grrr */
1345
/*
 * S_find_byclass - scan forward from s for a position at which the node c
 * can match, optionally attempting the full match there.
 *
 *   prog     the compiled regexp; its intflags and minlen are consulted.
 *   c        the node whose op selects the scanning strategy (see the
 *            switch on OP(c) below).
 *   s        where to start scanning.
 *   strend   end of the text to scan.
 *   reginfo  match state handed to regtry(); when NULL no regtry() call is
 *            made and the first candidate position found is accepted.
 *
 * Returns s as it stands when the got_it label is reached (a candidate
 * position that was accepted), or 0 when the scan finishes without one.
 * Croaks with "panic: unknown regstclass" for an op with no case below.
 */
1346 STATIC char *
1347 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1348     const char *strend, regmatch_info *reginfo)
1349 {
1350         dVAR;
             /* doevery is 0 when PREGf_SKIP is set in prog->intflags.  The
              * class-scan code assigns it to tmp after a failed attempt, so
              * with PREGf_SKIP no further attempt is made until a
              * non-matching position resets tmp to 1. */
1351         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1352         char *m;
1353         STRLEN ln;
1354         STRLEN lnc;
1355         register STRLEN uskip;
1356         unsigned int c1;
1357         unsigned int c2;
1358         char *e;
1359         register I32 tmp = 1;   /* Scratch variable? */
1360         register const bool utf8_target = PL_reg_match_utf8;
1361         RXi_GET_DECL(prog,progi);
1362
1363         PERL_ARGS_ASSERT_FIND_BYCLASS;
1364
1365         /* We know what class it must start with. */
1366         switch (OP(c)) {
1367         case ANYOF:
1368             if (utf8_target) {
1369                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
1370                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1371                           reginclass(prog, c, (U8*)s, 0, utf8_target) :
1372                           REGINCLASS(prog, c, (U8*)s));
1373             }
1374             else {
                      /* Hand-written form of the byte class scan: same
                       * tmp/doevery logic, but the step is `skip`, which
                       * is SHARP_S_SKIP when the sharp-s fold test hit. */
1375                  while (s < strend) {
1376                       STRLEN skip = 1;
1377
1378                       if (REGINCLASS(prog, c, (U8*)s) ||
1379                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1380                            /* The assignment of 2 is intentional:
1381                             * for the folded sharp s, the skip is 2. */
1382                            (skip = SHARP_S_SKIP))) {
1383                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1384                                 goto got_it;
1385                            else
1386                                 tmp = doevery;
1387                       }
1388                       else
1389                            tmp = 1;
1390                       s += skip;
1391                  }
1392             }
1393             break;
1394         case CANY:
                 /* No per-position condition here: every byte position is
                  * a candidate, subject only to the tmp/doevery gate. */
1395             REXEC_FBC_SCAN(
1396                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1397                     goto got_it;
1398                 else
1399                     tmp = doevery;
1400             );
1401             break;
             /* EXACTFU/EXACTF/EXACTFL: record the node text (m), its byte
              * length (ln) and character length (lnc), and compute c1/c2,
              * the two first-character candidates tested at do_exactf. */
1402         case EXACTFU:
1403         case EXACTF:
1404             m   = STRING(c);
1405             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1406             lnc = (I32) ln;     /* length to match in characters */
1407             if (UTF_PATTERN) {
1408                 STRLEN ulen1, ulen2;
1409                 U8 *sm = (U8 *) m;
1410                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1411                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1412                 /* used by commented-out code below */
1413                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1414
1415                 /* XXX: Since the node will be case folded at compile
1416                    time this logic is a little odd, although im not
1417                    sure that its actually wrong. --dmq */
1418
1419                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1420                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1421
1422                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1423                    codepoint of the first character in the converted
1424                    form, yet originally we did the extra step.
1425                    No tests fail by commenting this code out however
1426                    so Ive left it out. -- dmq.
1427
1428                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1429                                     0, uniflags);
1430                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1431                                     0, uniflags);
1432                 */
1433
                     /* Recount lnc as the number of UTF-8 characters in
                      * the ln bytes of the node text. */
1434                 lnc = 0;
1435                 while (sm < ((U8 *) m + ln)) {
1436                     lnc++;
1437                     sm += UTF8SKIP(sm);
1438                 }
1439             }
1440             else {
1441                 c1 = *(U8*)m;
1442                 if (utf8_target || OP(c) == EXACTFU) {
1443
1444                     /* Micro sign folds to GREEK SMALL LETTER MU;
1445                        LATIN_SMALL_LETTER_SHARP_S folds to 'ss', and this sets
1446                        c2 to the first 's' of the pair, and the code below will
1447                        look for others */
1448                     c2 = (c1 == MICRO_SIGN)
1449                         ? GREEK_SMALL_LETTER_MU
1450                         : (c1 == LATIN_SMALL_LETTER_SHARP_S)
1451                            ? 's'
1452                            : PL_fold_latin1[c1];
1453                 } else c2 = PL_fold[c1];
1454             }
1455             goto do_exactf;
1456         case EXACTFL:
1457             m   = STRING(c);
1458             ln  = STR_LEN(c);
1459             lnc = (I32) ln;
1460             c1 = *(U8*)m;
1461             c2 = PL_fold_locale[c1];
1462           do_exactf:
                 /* e is the last start position the scans below will try
                  * (they loop while s <= e): strend moved back by lnc
                  * characters through HOP3c, with s passed as the limit. */
1463             e = HOP3c(strend, -((I32)lnc), s);
1464
1465             if (!reginfo && e < s)
1466                 e = s;                  /* Due to minlen logic of intuit() */
1467
1468             /* The idea in the EXACTF* cases is to first find the
1469              * first character of the EXACTF* node and then, if
1470              * necessary, case-insensitively compare the full
1471              * text of the node.  The c1 and c2 are the first
1472              * characters (though in Unicode it gets a bit
1473              * more complicated because there are more cases
1474              * than just upper and lower: one needs to use
1475              * the so-called folding case for case-insensitive
1476              * matching (called "loose matching" in Unicode).
1477              * foldEQ_utf8() will do just that. */
1478
1479             if (utf8_target || UTF_PATTERN) {
1480                 UV c, f;
1481                 U8 tmpbuf [UTF8_MAXBYTES+1];
1482                 STRLEN len = 1;
1483                 STRLEN foldlen;
1484                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1485                 if (c1 == c2) {
1486                     /* Upper and lower of 1st char are equal -
1487                      * probably not a "letter". */
1488                     while (s <= e) {
1489                         if (utf8_target) {
1490                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1491                                            uniflags);
1492                         } else {
1493                             c = *((U8*)s);
1494                         }
1495                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1496                     }
1497                 }
1498                 else {
1499                     while (s <= e) {
1500                         if (utf8_target) {
1501                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1502                                            uniflags);
1503                         } else {
1504                             c = *((U8*)s);
1505                         }
1506
1507                         /* Handle some of the three Greek sigmas cases.
1508                          * Note that not all the possible combinations
1509                          * are handled here: some of them are handled
1510                          * by the standard folding rules, and some of
1511                          * them (the character class or ANYOF cases)
1512                          * are handled during compiletime in
1513                          * regcomp.c:S_regclass(). */
1514                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1515                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1516                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1517
1518                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1519                     }
1520                 }
1521             }
1522             else {
1523                 /* Neither pattern nor string are UTF8 */
1524                 if (c1 == c2)
1525                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1526                 else
1527                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1528             }
1529             break;
             /* BOUND/BOUNDL: tmp is first loaded with the character before
              * s ('\n' when s is at PL_bostr) and then reduced to a 0/1
              * flag recording whether that character passed the word
              * character test.  The scan attempts a match wherever the
              * flag for the current character differs, toggling tmp. */
1530         case BOUNDL:
1531             PL_reg_flags |= RF_tainted;
1532             /* FALL THROUGH */
1533         case BOUND:
1534             if (utf8_target) {
1535                 if (s == PL_bostr)
1536                     tmp = '\n';
1537                 else {
1538                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1539                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1540                 }
1541                 tmp = ((OP(c) == BOUND ?
1542                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1543                 LOAD_UTF8_CHARCLASS_ALNUM();
1544                 REXEC_FBC_UTF8_SCAN(
1545                     if (tmp == !(OP(c) == BOUND ?
1546                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1547                                  isALNUM_LC_utf8((U8*)s)))
1548                     {
1549                         tmp = !tmp;
1550                         REXEC_FBC_TRYIT;
1551                 }
1552                 );
1553             }
1554             else {  /* Not utf8 */
1555                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1556                 tmp = cBOOL((OP(c) == BOUNDL)
1557                             ? isALNUM_LC(tmp)
1558                             : (isWORDCHAR_L1(tmp)
1559                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1560                 REXEC_FBC_SCAN(
1561                     if (tmp ==
1562                         !((OP(c) == BOUNDL)
1563                           ? isALNUM_LC(*s)
1564                           : (isWORDCHAR_L1((U8) *s)
1565                              && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1566                     {
1567                         tmp = !tmp;
1568                         REXEC_FBC_TRYIT;
1569                 }
1570                 );
1571             }
                 /* After the scan: when the pattern has no minimum length
                  * and the final flag is set, also try the position where
                  * the scan stopped. */
1572             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1573                 goto got_it;
1574             break;
             /* NBOUND/NBOUNDL: same flag bookkeeping as BOUND/BOUNDL, but
              * a match is attempted where the flag does NOT change. */
1575         case NBOUNDL:
1576             PL_reg_flags |= RF_tainted;
1577             /* FALL THROUGH */
1578         case NBOUND:
1579             if (utf8_target) {
1580                 if (s == PL_bostr)
1581                     tmp = '\n';
1582                 else {
1583                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1584                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1585                 }
1586                 tmp = ((OP(c) == NBOUND ?
1587                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1588                 LOAD_UTF8_CHARCLASS_ALNUM();
1589                 REXEC_FBC_UTF8_SCAN(
1590                     if (tmp == !(OP(c) == NBOUND ?
1591                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1592                                  isALNUM_LC_utf8((U8*)s)))
1593                         tmp = !tmp;
1594                     else REXEC_FBC_TRYIT;
1595                 );
1596             }
1597             else {
1598                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1599                 tmp = cBOOL((OP(c) == NBOUNDL)
1600                             ? isALNUM_LC(tmp)
1601                             : (isWORDCHAR_L1(tmp)
1602                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1603                 REXEC_FBC_SCAN(
1604                     if (tmp == ! cBOOL(
1605                             (OP(c) == NBOUNDL)
1606                             ? isALNUM_LC(*s)
1607                             : (isWORDCHAR_L1((U8) *s)
1608                                && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1609                     {
1610                         tmp = !tmp;
1611                     }
1612                     else REXEC_FBC_TRYIT;
1613                 );
1614             }
                 /* After the scan: when the pattern has no minimum length
                  * and the final flag is clear, also try the position where
                  * the scan stopped. */
1615             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1616                 goto got_it;
1617             break;
             /* Each REXEC_FBC_CSCAN* invocation below expands to the whole
              * case body, including the `break` that ends it, so none of
              * these cases falls through into the next one. */
1618         case ALNUM:
1619             REXEC_FBC_CSCAN_PRELOAD(
1620                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1621                 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1622                 (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
1623             );
1624         case ALNUML:
1625             REXEC_FBC_CSCAN_TAINT(
1626                 isALNUM_LC_utf8((U8*)s),
1627                 isALNUM_LC(*s)
1628             );
1629         case NALNUM:
1630             REXEC_FBC_CSCAN_PRELOAD(
1631                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1632                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1633                 ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
1634             );
1635         case NALNUML:
1636             REXEC_FBC_CSCAN_TAINT(
1637                 !isALNUM_LC_utf8((U8*)s),
1638                 !isALNUM_LC(*s)
1639             );
1640         case SPACE:
1641             REXEC_FBC_CSCAN_PRELOAD(
1642                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1643                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1644                 isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
1645             );
1646         case SPACEL:
1647             REXEC_FBC_CSCAN_TAINT(
1648                 isSPACE_LC_utf8((U8*)s),
1649                 isSPACE_LC(*s)
1650             );
1651         case NSPACE:
1652             REXEC_FBC_CSCAN_PRELOAD(
1653                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1654                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1655                 !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
1656             );
1657         case NSPACEL:
1658             REXEC_FBC_CSCAN_TAINT(
1659                 !isSPACE_LC_utf8((U8*)s),
1660                 !isSPACE_LC(*s)
1661             );
1662         case DIGIT:
1663             REXEC_FBC_CSCAN_PRELOAD(
1664                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1665                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1666                 isDIGIT(*s)
1667             );
1668         case DIGITL:
1669             REXEC_FBC_CSCAN_TAINT(
1670                 isDIGIT_LC_utf8((U8*)s),
1671                 isDIGIT_LC(*s)
1672             );
1673         case NDIGIT:
1674             REXEC_FBC_CSCAN_PRELOAD(
1675                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1676                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1677                 !isDIGIT(*s)
1678             );
1679         case NDIGITL:
1680             REXEC_FBC_CSCAN_TAINT(
1681                 !isDIGIT_LC_utf8((U8*)s),
1682                 !isDIGIT_LC(*s)
1683             );
1684         case LNBREAK:
1685             REXEC_FBC_CSCAN(
1686                 is_LNBREAK_utf8(s),
1687                 is_LNBREAK_latin1(s)
1688             );
1689         case VERTWS:
1690             REXEC_FBC_CSCAN(
1691                 is_VERTWS_utf8(s),
1692                 is_VERTWS_latin1(s)
1693             );
1694         case NVERTWS:
1695             REXEC_FBC_CSCAN(
1696                 !is_VERTWS_utf8(s),
1697                 !is_VERTWS_latin1(s)
1698             );
1699         case HORIZWS:
1700             REXEC_FBC_CSCAN(
1701                 is_HORIZWS_utf8(s),
1702                 is_HORIZWS_latin1(s)
1703             );
1704         case NHORIZWS:
1705             REXEC_FBC_CSCAN(
1706                 !is_HORIZWS_utf8(s),
1707                 !is_HORIZWS_latin1(s)
1708             );
1709         case AHOCORASICKC:
1710         case AHOCORASICK:
1711             {
1712                 DECL_TRIE_TYPE(c);
1713                 /* what trie are we using right now */
1714                 reg_ac_data *aho
1715                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1716                 reg_trie_data *trie
1717                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1718                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1719
                     /* Last position at which the outer loop below still
                      * starts a scan. */
1720                 const char *last_start = strend - trie->minlen;
1721 #ifdef DEBUGGING
1722                 const char *real_start = s;
1723 #endif
1724                 STRLEN maxlen = trie->maxlen;
1725                 SV *sv_points;
1726                 U8 **points; /* map of where we were in the input string
1727                                 when reading a given char. For ASCII this
1728                                 is unnecessary overhead as the relationship
1729                                 is always 1:1, but for Unicode, especially
1730                                 case folded Unicode this is not true. */
1731                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1732                 U8 *bitmap=NULL;
1733
1734
1735                 GET_RE_DEBUG_FLAGS_DECL;
1736
1737                 /* We can't just allocate points here. We need to wrap it in
1738                  * an SV so it gets freed properly if there is a croak while
1739                  * running the match */
1740                 ENTER;
1741                 SAVETMPS;
1742                 sv_points=newSV(maxlen * sizeof(U8 *));
1743                 SvCUR_set(sv_points,
1744                     maxlen * sizeof(U8 *));
1745                 SvPOK_on(sv_points);
1746                 sv_2mortal(sv_points);
1747                 points=(U8**)SvPV_nolen(sv_points );
                     /* A start-character bitmap is used only when the trie
                      * is not of the UTF-8 folding type: the trie's own
                      * bitmap if present, else the node's for AHOCORASICKC. */
1748                 if ( trie_type != trie_utf8_fold
1749                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1750                 {
1751                     if (trie->bitmap)
1752                         bitmap=(U8*)trie->bitmap;
1753                     else
1754                         bitmap=(U8*)ANYOF_BITMAP(c);
1755                 }
1756                 /* this is the Aho-Corasick algorithm modified a touch
1757                    to include special handling for long "unknown char"
1758                    sequences. The basic idea being that we use AC as long
1759                    as we are dealing with a possible matching char, when
1760                    we encounter an unknown char (and we have not encountered
1761                    an accepting state) we scan forward until we find a legal
1762                    starting char.
1763                    AC matching is basically that of trie matching, except
1764                    that when we encounter a failing transition, we fall back
1765                    to the current states "fail state", and try the current char
1766                    again, a process we repeat until we reach the root state,
1767                    state 1, or a legal transition. If we fail on the root state
1768                    then we can either terminate if we have reached an accepting
1769                    state previously, or restart the entire process from the beginning
1770                    if we have not.
1771
1772                  */
1773                 while (s <= last_start) {
1774                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1775                     U8 *uc = (U8*)s;
1776                     U16 charid = 0;
1777                     U32 base = 1;
1778                     U32 state = 1;
1779                     UV uvc = 0;
1780                     STRLEN len = 0;
1781                     STRLEN foldlen = 0;
1782                     U8 *uscan = (U8*)NULL;
1783                     U8 *leftmost = NULL;
1784 #ifdef DEBUGGING
1785                     U32 accepted_word= 0;
1786 #endif
1787                     U32 pointpos = 0;
1788
1789                     while ( state && uc <= (U8*)strend ) {
1790                         int failed=0;
1791                         U32 word = aho->states[ state ].wordnum;
1792
                             /* In the root state, use the bitmap (if any)
                              * to skip characters that cannot start a word,
                              * and give up once past last_start. */
1793                         if( state==1 ) {
1794                             if ( bitmap ) {
1795                                 DEBUG_TRIE_EXECUTE_r(
1796                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1797                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1798                                             (char *)uc, utf8_target );
1799                                         PerlIO_printf( Perl_debug_log,
1800                                             " Scanning for legal start char...\n");
1801                                     }
1802                                 );
1803                                 if (utf8_target) {
1804                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1805                                         uc += UTF8SKIP(uc);
1806                                     }
1807                                 } else {
1808                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1809                                         uc++;
1810                                     }
1811                                 }
1812                                 s= (char *)uc;
1813                             }
1814                             if (uc >(U8*)last_start) break;
1815                         }
1816
                             /* The current state accepts a word: look up
                              * where that word began and keep the leftmost
                              * such start seen so far. */
1817                         if ( word ) {
1818                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1819                             if (!leftmost || lpos < leftmost) {
1820                                 DEBUG_r(accepted_word=word);
1821                                 leftmost= lpos;
1822                             }
1823                             if (base==0) break;
1824
1825                         }
                             /* points is indexed modulo maxlen, i.e. it is
                              * used as a ring buffer of input positions. */
1826                         points[pointpos++ % maxlen]= uc;
1827                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1828                                              uscan, len, uvc, charid, foldlen,
1829                                              foldbuf, uniflags);
1830                         DEBUG_TRIE_EXECUTE_r({
1831                             dump_exec_pos( (char *)uc, c, strend, real_start,
1832                                 s,   utf8_target );
1833                             PerlIO_printf(Perl_debug_log,
1834                                 " Charid:%3u CP:%4"UVxf" ",
1835                                  charid, uvc);
1836                         });
1837
                             /* Follow a legal transition for charid if there
                              * is one; otherwise fall back along aho->fail
                              * and retry, until state reaches 0. */
1838                         do {
1839 #ifdef DEBUGGING
1840                             word = aho->states[ state ].wordnum;
1841 #endif
1842                             base = aho->states[ state ].trans.base;
1843
1844                             DEBUG_TRIE_EXECUTE_r({
1845                                 if (failed)
1846                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1847                                         s,   utf8_target );
1848                                 PerlIO_printf( Perl_debug_log,
1849                                     "%sState: %4"UVxf", word=%"UVxf,
1850                                     failed ? " Fail transition to " : "",
1851                                     (UV)state, (UV)word);
1852                             });
1853                             if ( base ) {
1854                                 U32 tmp;
1855                                 I32 offset;
1856                                 if (charid &&
1857                                      ( ((offset = base + charid
1858                                         - 1 - trie->uniquecharcount)) >= 0)
1859                                      && ((U32)offset < trie->lasttrans)
1860                                      && trie->trans[offset].check == state
1861                                      && (tmp=trie->trans[offset].next))
1862                                 {
1863                                     DEBUG_TRIE_EXECUTE_r(
1864                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1865                                     state = tmp;
1866                                     break;
1867                                 }
1868                                 else {
1869                                     DEBUG_TRIE_EXECUTE_r(
1870                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1871                                     failed = 1;
1872                                     state = aho->fail[state];
1873                                 }
1874                             }
1875                             else {
1876                                 /* we must be accepting here */
1877                                 DEBUG_TRIE_EXECUTE_r(
1878                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1879                                 failed = 1;
1880                                 break;
1881                             }
1882                         } while(state);
1883                         uc += len;
1884                         if (failed) {
1885                             if (leftmost)
1886                                 break;
1887                             if (!state) state = 1;
1888                         }
1889                     }
                         /* The state the inner loop ended in may itself
                          * accept a word; fold it into leftmost as well. */
1890                     if ( aho->states[ state ].wordnum ) {
1891                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1892                         if (!leftmost || lpos < leftmost) {
1893                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1894                             leftmost = lpos;
1895                         }
1896                     }
1897                     if (leftmost) {
1898                         s = (char*)leftmost;
1899                         DEBUG_TRIE_EXECUTE_r({
1900                             PerlIO_printf(
1901                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1902                                 (UV)accepted_word, (IV)(s - real_start)
1903                             );
1904                         });
                             /* Leave the ENTER/SAVETMPS scope before
                              * jumping out of the block on success. */
1905                         if (!reginfo || regtry(reginfo, &s)) {
1906                             FREETMPS;
1907                             LEAVE;
1908                             goto got_it;
1909                         }
1910                         s = HOPc(s,1);
1911                         DEBUG_TRIE_EXECUTE_r({
1912                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1913                         });
1914                     } else {
1915                         DEBUG_TRIE_EXECUTE_r(
1916                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1917                         break;
1918                     }
1919                 }
1920                 FREETMPS;
1921                 LEAVE;
1922             }
1923             break;
1924         default:
1925             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1926             break;
1927         }
             /* No candidate position was accepted. */
1928         return 0;
1929       got_it:
1930         return s;
1931 }
1932
1933
1934 /*
1935  - regexec_flags - match a regexp against a string; returns 1 on a match, 0 on failure
1936  */
1937 I32
1938 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1939               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1940 /* stringarg: where matching starts (copied to startpos); strend: pointer to null at end of string */
1941 /* strbeg: real beginning of string */
1942 /* minend: end of match must be >=minend after stringarg. */
1943 /* data: May be used for some additional optimizations.
1944          Currently it is only used, via PTR2UV(), for transmitting
1945          the ganch offset when doing a /g match. This will change */
1946 /* sv: SV consulted for DO_UTF8() and pos() magic.  flags: REXEC_* bits; tested here: REXEC_IGNOREPOS, REXEC_CHECKED, REXEC_SCREAM, REXEC_NOT_FIRST, REXEC_COPY_STR. */
1947 {
1948     dVAR;
1949     struct regexp *const prog = (struct regexp *)SvANY(rx);
1950     /*register*/ char *s;
1951     register regnode *c;
1952     /*register*/ char *startpos = stringarg;
1953     I32 minlen;         /* must match at least this many chars */
1954     I32 dontbother = 0; /* how many characters not to try at end */
1955     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1956     I32 scream_pos = -1;                /* Internal iterator of scream. */
1957     char *scream_olds = NULL;
1958     const bool utf8_target = cBOOL(DO_UTF8(sv));
1959     I32 multiline;
1960     RXi_GET_DECL(prog,progi);
1961     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1962     regexp_paren_pair *swap = NULL;
1963     GET_RE_DEBUG_FLAGS_DECL;
1964
1965     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1966     PERL_UNUSED_ARG(data);      /* NOTE: data is nevertheless read in the RXf_GPOS_SEEN branch below */
1967
1968     /* Be paranoid... */
1969     if (prog == NULL || startpos == NULL) {
1970         Perl_croak(aTHX_ "NULL regexp parameter");
1971         return 0;
1972     }
1973
1974     multiline = prog->extflags & RXf_PMf_MULTILINE;
1975     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1976
1977     RX_MATCH_UTF8_set(rx, utf8_target);
1978     DEBUG_EXECUTE_r(
1979         debug_start_match(rx, utf8_target, startpos, strend,
1980         "Matching");
1981     );
1982
1983     minlen = prog->minlen;
1984
1985     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1986         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1987                               "String too short [regexec_flags]...\n"));
1988         goto phooey;
1989     }
1990
1991
1992     /* Check validity of program. */
1993     if (UCHARAT(progi->program) != REG_MAGIC) {
1994         Perl_croak(aTHX_ "corrupted regexp program");
1995     }
1996
1997     PL_reg_flags = 0;
1998     PL_reg_eval_set = 0;
1999     PL_reg_maxiter = 0;
2000
2001     if (RX_UTF8(rx))
2002         PL_reg_flags |= RF_utf8;
2003
2004     /* Mark beginning of line for ^ and lookbehind. */
2005     reginfo.bol = startpos; /* XXX not used ??? */
2006     PL_bostr  = strbeg;
2007     reginfo.sv = sv;
2008
2009     /* Mark end of line for $ (and such) */
2010     PL_regeol = strend;
2011
2012     /* see how far we have to get to not match where we matched before */
2013     reginfo.till = startpos+minend;
2014
2015     /* If there is a "must appear" string, look for it. */
2016     s = startpos;
2017
2018     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2019         MAGIC *mg;
2020         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2021             reginfo.ganch = startpos + prog->gofs;
2022             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2023               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2024         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2025                   && SvMAGIC(sv)
2026                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2027                   && mg->mg_len >= 0) {
2028             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2029             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2030                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2031
2032             if (prog->extflags & RXf_ANCH_GPOS) {
2033                 if (s > reginfo.ganch)
2034                     goto phooey;
2035                 s = reginfo.ganch - prog->gofs;
2036                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2037                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2038                 if (s < strbeg)
2039                     goto phooey;
2040             }
2041         }
2042         else if (data) {
2043             reginfo.ganch = strbeg + PTR2UV(data);
2044             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2045                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2046
2047         } else {                                /* pos() not defined */
2048             reginfo.ganch = strbeg;
2049             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2050                  "GPOS: reginfo.ganch = strbeg\n"));
2051         }
2052     }
2053     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2054         /* We have to be careful. If the previous successful match
2055            was from this regex we don't want a subsequent partially
2056            successful match to clobber the old results.
2057            So when we detect this possibility we add a swap buffer
2058            to the re, and switch the buffer each match. If we fail
2059            we switch it back, otherwise we leave it swapped.
2060         */
2061         swap = prog->offs;
2062         /* do we need a save destructor here for eval dies? */
2063         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2064     }
2065     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2066         re_scream_pos_data d;
2067
2068         d.scream_olds = &scream_olds;
2069         d.scream_pos = &scream_pos;
2070         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2071         if (!s) {
2072             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2073             goto phooey;        /* not present */
2074         }
2075     }
2076
2077
2078
2079     /* Simplest case:  anchored match need be tried only once. */
2080     /*  [unless only anchor is BOL and multiline is set] */
2081     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2082         if (s == startpos && regtry(&reginfo, &startpos))
2083             goto got_it;
2084         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2085                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2086         {
2087             char *end;
2088
2089             if (minlen)
2090                 dontbother = minlen - 1;
2091             end = HOP3c(strend, -dontbother, strbeg) - 1;
2092             /* for multiline we only have to try after newlines */
2093             if (prog->check_substr || prog->check_utf8) {
2094                 /* because of the goto we cannot easily reuse the macros for bifurcating the
2095                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2096                 if (utf8_target) {
2097                     if (s == startpos)
2098                         goto after_try_utf8;
2099                     while (1) {
2100                         if (regtry(&reginfo, &s)) {
2101                             goto got_it;
2102                         }
2103                       after_try_utf8:
2104                         if (s > end) {
2105                             goto phooey;
2106                         }
2107                         if (prog->extflags & RXf_USE_INTUIT) {
2108                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2109                             if (!s) {
2110                                 goto phooey;
2111                             }
2112                         }
2113                         else {
2114                             s += UTF8SKIP(s);
2115                         }
2116                     }
2117                 } /* end search for check string in unicode */
2118                 else {
2119                     if (s == startpos) {
2120                         goto after_try_latin;
2121                     }
2122                     while (1) {
2123                         if (regtry(&reginfo, &s)) {
2124                             goto got_it;
2125                         }
2126                       after_try_latin:
2127                         if (s > end) {
2128                             goto phooey;
2129                         }
2130                         if (prog->extflags & RXf_USE_INTUIT) {
2131                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2132                             if (!s) {
2133                                 goto phooey;
2134                             }
2135                         }
2136                         else {
2137                             s++;
2138                         }
2139                     }
2140                 } /* end search for check string in latin*/
2141             } /* end search for check string */
2142             else { /* search for newline */
2143                 if (s > startpos) {
2144                     /*XXX: The s-- is almost definitely wrong here under unicode - demerphq*/
2145                     s--;
2146                 }
2147                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2148                 while (s < end) {
2149                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2150                         if (regtry(&reginfo, &s))
2151                             goto got_it;
2152                     }
2153                 }
2154             } /* end search for newline */
2155         } /* end anchored/multiline check string search */
2156         goto phooey;
2157     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2158     {
2159         /* the warning about reginfo.ganch being used without initialization
2160            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2161            and we only enter this block when the same bit is set. */
2162         char *tmp_s = reginfo.ganch - prog->gofs;
2163
2164         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2165             goto got_it;
2166         goto phooey;
2167     }
2168
2169     /* Messy cases:  unanchored match. */
2170     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2171         /* we have /x+whatever/ */
2172         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2173         char ch;
2174 #ifdef DEBUGGING
2175         int did_match = 0;
2176 #endif
2177         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2178             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2179         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2180
2181         if (utf8_target) {
2182             REXEC_FBC_SCAN(
2183                 if (*s == ch) {
2184                     DEBUG_EXECUTE_r( did_match = 1 );
2185                     if (regtry(&reginfo, &s)) goto got_it;
2186                     s += UTF8SKIP(s);
2187                     while (s < strend && *s == ch)
2188                         s += UTF8SKIP(s);
2189                 }
2190             );
2191         }
2192         else {
2193             REXEC_FBC_SCAN(
2194                 if (*s == ch) {
2195                     DEBUG_EXECUTE_r( did_match = 1 );
2196                     if (regtry(&reginfo, &s)) goto got_it;
2197                     s++;
2198                     while (s < strend && *s == ch)
2199                         s++;
2200                 }
2201             );
2202         }
2203         DEBUG_EXECUTE_r(if (!did_match)
2204                 PerlIO_printf(Perl_debug_log,
2205                                   "Did not find anchored character...\n")
2206                );
2207     }
2208     else if (prog->anchored_substr != NULL
2209               || prog->anchored_utf8 != NULL
2210               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2211                   && prog->float_max_offset < strend - s)) { /* substring-guided scan: locate 'must', then regtry() the candidate starts before each hit */
2212         SV *must;
2213         I32 back_max;
2214         I32 back_min;
2215         char *last;
2216         char *last1;            /* Last position checked before */
2217 #ifdef DEBUGGING
2218         int did_match = 0;
2219 #endif
2220         if (prog->anchored_substr || prog->anchored_utf8) {
2221             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2222                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2223             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2224             back_max = back_min = prog->anchored_offset;
2225         } else {
2226             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2227                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2228             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2229             back_max = prog->float_max_offset;
2230             back_min = prog->float_min_offset;
2231         }
2232
2233
2234         if (must == &PL_sv_undef)
2235             /* could not downgrade utf8 check substring, so must fail */
2236             goto phooey;
2237
2238         if (back_min<0) {
2239             last = strend;
2240         } else {
2241             last = HOP3c(strend,        /* Cannot start after this */
2242                   -(I32)(CHR_SVLEN(must)
2243                          - (SvTAIL(must) != 0) + back_min), strbeg);
2244         }
2245         if (s > PL_bostr)
2246             last1 = HOPc(s, -1);
2247         else
2248             last1 = s - 1;      /* bogus */
2249
2250         /* XXXX check_substr already used to find "s", can optimize if
2251            check_substr==must. */
2252         scream_pos = -1;
2253         dontbother = end_shift;
2254         strend = HOPc(strend, -dontbother);
2255         while ( (s <= last) &&
2256                 ((flags & REXEC_SCREAM)
2257                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2258                                     end_shift, &scream_pos, 0))
2259                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2260                                   (unsigned char*)strend, must,
2261                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2262             /* we may be pointing at the wrong string */
2263             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2264                 s = strbeg + (s - SvPVX_const(sv));
2265             DEBUG_EXECUTE_r( did_match = 1 );
2266             if (HOPc(s, -back_max) > last1) {
2267                 last1 = HOPc(s, -back_min);
2268                 s = HOPc(s, -back_max);
2269             }
2270             else {
2271                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2272
2273                 last1 = HOPc(s, -back_min);
2274                 s = t;
2275             }
2276             if (utf8_target) {
2277                 while (s <= last1) {
2278                     if (regtry(&reginfo, &s))
2279                         goto got_it;
2280                     s += UTF8SKIP(s);
2281                 }
2282             }
2283             else {
2284                 while (s <= last1) {
2285                     if (regtry(&reginfo, &s))
2286                         goto got_it;
2287                     s++;
2288                 }
2289             }
2290         }
2291         DEBUG_EXECUTE_r(if (!did_match) {
2292             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2293                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2294             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2295                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2296                                ? "anchored" : "floating"),
2297                 quoted, RE_SV_TAIL(must));
2298         });
2299         goto phooey;
2300     }
2301     else if ( (c = progi->regstclass) ) { /* regstclass is set: hand the scan to find_byclass() */
2302         if (minlen) {
2303             const OPCODE op = OP(progi->regstclass);
2304             /* don't bother with what can't match */
2305             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2306                 strend = HOPc(strend, -(minlen - 1));
2307         }
2308         DEBUG_EXECUTE_r({
2309             SV * const prop = sv_newmortal();
2310             regprop(prog, prop, c);
2311             {
2312                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2313                     s,strend-s,60);
2314                 PerlIO_printf(Perl_debug_log,
2315                     "Matching stclass %.*s against %s (%d bytes)\n",
2316                     (int)SvCUR(prop), SvPVX_const(prop),
2317                      quoted, (int)(strend - s));
2318             }
2319         });
2320         if (find_byclass(prog, c, s, strend, &reginfo))
2321             goto got_it;
2322         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2323     }
2324     else {
2325         dontbother = 0;
2326         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2327             /* Trim the end. */
2328             char *last;
2329             SV* float_real;
2330
2331             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2332                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2333             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2334
2335             if (flags & REXEC_SCREAM) {
2336                 last = screaminstr(sv, float_real, s - strbeg,
2337                                    end_shift, &scream_pos, 1); /* last one */
2338                 if (!last)
2339                     last = scream_olds; /* Only one occurrence. */
2340                 /* we may be pointing at the wrong string */
2341                 else if (RXp_MATCH_COPIED(prog))
2342                     s = strbeg + (s - SvPVX_const(sv));
2343             }
2344             else {
2345                 STRLEN len;
2346                 const char * const little = SvPV_const(float_real, len);
2347
2348                 if (SvTAIL(float_real)) {
2349                     if (memEQ(strend - len + 1, little, len - 1))
2350                         last = strend - len + 1;
2351                     else if (!multiline)
2352                         last = memEQ(strend - len, little, len)
2353                             ? strend - len : NULL;
2354                     else
2355                         goto find_last;
2356                 } else {
2357                   find_last:
2358                     if (len)
2359                         last = rninstr(s, strend, little, little + len);
2360                     else
2361                         last = strend;  /* matching "$" */
2362                 }
2363             }
2364             if (last == NULL) {
2365                 DEBUG_EXECUTE_r(
2366                     PerlIO_printf(Perl_debug_log,
2367                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2368                         PL_colors[4], PL_colors[5]));
2369                 goto phooey; /* Should not happen! */
2370             }
2371             dontbother = strend - last + prog->float_min_offset;
2372         }
2373         if (minlen && (dontbother < minlen))
2374             dontbother = minlen - 1;
2375         strend -= dontbother;              /* this one's always in bytes! */
2376         /* We don't know much -- general case. */
2377         if (utf8_target) {
2378             for (;;) {
2379                 if (regtry(&reginfo, &s))
2380                     goto got_it;
2381                 if (s >= strend)
2382                     break;
2383                 s += UTF8SKIP(s);
2384             };
2385         }
2386         else {
2387             do {
2388                 if (regtry(&reginfo, &s))
2389                     goto got_it;
2390             } while (s++ < strend);
2391         }
2392     }
2393
2394     /* Failure. */
2395     goto phooey;
2396
2397 got_it:        /* success: free the swapped-out offsets buffer (if any), record taint and match-string info, return 1 */
2398     Safefree(swap);
2399     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2400
2401     if (PL_reg_eval_set)
2402         restore_pos(aTHX_ prog);
2403     if (RXp_PAREN_NAMES(prog))
2404         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2405
2406     /* make sure $`, $&, $', and $digit will work later */
2407     if ( !(flags & REXEC_NOT_FIRST) ) {
2408         RX_MATCH_COPY_FREE(rx);
2409         if (flags & REXEC_COPY_STR) {
2410             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2411 #ifdef PERL_OLD_COPY_ON_WRITE
2412             if ((SvIsCOW(sv)
2413                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2414                 if (DEBUG_C_TEST) {
2415                     PerlIO_printf(Perl_debug_log,
2416                                   "Copy on write: regexp capture, type %d\n",
2417                                   (int) SvTYPE(sv));
2418                 }
2419                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2420                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2421                 assert (SvPOKp(prog->saved_copy));
2422             } else
2423 #endif
2424             {
2425                 RX_MATCH_COPIED_on(rx);
2426                 s = savepvn(strbeg, i);
2427                 prog->subbeg = s;
2428             }
2429             prog->sublen = i;
2430         }
2431         else {
2432             prog->subbeg = strbeg;
2433             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2434         }
2435     }
2436
2437     return 1;
2438
2439 phooey:        /* failure: call restore_pos() if PL_reg_eval_set, put back the swapped-out offsets buffer, return 0 */
2440     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2441                           PL_colors[4], PL_colors[5]));
2442     if (PL_reg_eval_set)
2443         restore_pos(aTHX_ prog);
2444     if (swap) {
2445         /* we failed :-( roll it back */
2446         Safefree(prog->offs);
2447         prog->offs = swap;
2448     }
2449
2450     return 0;
2451 }
2452
2453
2454 /*
2455  - regtry - try match at specific point (*startpos); on failure *startpos is moved to reginfo->cutpoint if that was set
2456  */
2457 STATIC I32                      /* 0 failure, 1 success */
2458 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2459 {
2460     dVAR;
2461     CHECKPOINT lastcp;
2462     REGEXP *const rx = reginfo->prog;
2463     regexp *const prog = (struct regexp *)SvANY(rx);
2464     RXi_GET_DECL(prog,progi);
2465     GET_RE_DEBUG_FLAGS_DECL;
2466
2467     PERL_ARGS_ASSERT_REGTRY;
2468
2469     reginfo->cutpoint=NULL;
2470
2471     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) { /* setup for RXf_EVAL_SEEN patterns; skipped while PL_reg_eval_set stays set */
2472         MAGIC *mg;
2473
2474         PL_reg_eval_set = RS_init;
2475         DEBUG_EXECUTE_r(DEBUG_s(
2476             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2477                           (IV)(PL_stack_sp - PL_stack_base));
2478             ));
2479         SAVESTACK_CXPOS();
2480         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2481         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2482         SAVETMPS;
2483         /* Apparently this is not needed, judging by wantarray. */
2484         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2485            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2486
2487         if (reginfo->sv) {
2488             /* Make $_ available to executed code. */
2489             if (reginfo->sv != DEFSV) {
2490                 SAVE_DEFSV;
2491                 DEFSV_set(reginfo->sv);
2492             }
2493
2494             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2495                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2496                 /* prepare for quick setting of pos */
2497 #ifdef PERL_OLD_COPY_ON_WRITE
2498                 if (SvIsCOW(reginfo->sv))
2499                     sv_force_normal_flags(reginfo->sv, 0);
2500 #endif
2501                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2502                                  &PL_vtbl_mglob, NULL, 0);
2503                 mg->mg_len = -1;
2504             }
2505             PL_reg_magic    = mg;
2506             PL_reg_oldpos   = mg->mg_len;
2507             SAVEDESTRUCTOR_X(restore_pos, prog);
2508         }
2509         if (!PL_reg_curpm) {
2510             Newxz(PL_reg_curpm, 1, PMOP);
2511 #ifdef USE_ITHREADS
2512             {
2513                 SV* const repointer = &PL_sv_undef;
2514                 /* this regexp is also owned by the new PL_reg_curpm, which
2515                    will try to free it.  */
2516                 av_push(PL_regex_padav, repointer);
2517                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2518                 PL_regex_pad = AvARRAY(PL_regex_padav);
2519             }
2520 #endif
2521         }
2522 #ifdef USE_ITHREADS
2523         /* It seems that non-ithreads works both with and without this code.
2524            So for efficiency reasons it seems best not to have the code
2525            compiled when it is not needed.  */
2526         /* This is safe against NULLs: */
2527         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2528         /* PL_reg_curpm owns a reference to this regexp.  */
2529         ReREFCNT_inc(rx);
2530 #endif
2531         PM_SETRE(PL_reg_curpm, rx);
2532         PL_reg_oldcurpm = PL_curpm;
2533         PL_curpm = PL_reg_curpm;
2534         if (RXp_MATCH_COPIED(prog)) {
2535             /*  Here is a serious problem: we cannot rewrite subbeg,
2536                 since it may be needed if this match fails.  Thus
2537                 $` inside (?{}) could fail... */
2538             PL_reg_oldsaved = prog->subbeg;
2539             PL_reg_oldsavedlen = prog->sublen;
2540 #ifdef PERL_OLD_COPY_ON_WRITE
2541             PL_nrs = prog->saved_copy;
2542 #endif
2543             RXp_MATCH_COPIED_off(prog);
2544         }
2545         else
2546             PL_reg_oldsaved = NULL;
2547         prog->subbeg = PL_bostr;
2548         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2549     }
2550     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2551     prog->offs[0].start = *startpos - PL_bostr; /* offsets are stored relative to PL_bostr */
2552     PL_reginput = *startpos;
2553     PL_reglastparen = &prog->lastparen;
2554     PL_reglastcloseparen = &prog->lastcloseparen;
2555     prog->lastparen = 0;
2556     prog->lastcloseparen = 0;
2557     PL_regsize = 0;
2558     PL_regoffs = prog->offs;
2559     if (PL_reg_start_tmpl <= prog->nparens) { /* grow PL_reg_start_tmp so it holds more than nparens entries */
2560         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2561         if(PL_reg_start_tmp)
2562             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2563         else
2564             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2565     }
2566
2567     /* XXXX What this code is doing here?!!!  There should be no need
2568        to do this again and again, PL_reglastparen should take care of
2569        this!  --ilya*/
2570
2571     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2572      * Actually, the code in regcppop() (which Ilya may be meaning by
2573      * PL_reglastparen), is not needed at all by the test suite
2574      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2575      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2576      * Meanwhile, this code *is* needed for the
2577      * above-mentioned test suite tests to succeed.  The common theme
2578      * on those tests seems to be returning null fields from matches.
2579      * --jhi updated by dapm */
2580 #if 1
2581     if (prog->nparens) {
2582         regexp_paren_pair *pp = PL_regoffs;
2583         register I32 i;
2584         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) { /* lastparen was zeroed above, so this resets pairs 1..nparens to -1 */
2585             ++pp;
2586             pp->start = -1;
2587             pp->end = -1;
2588         }
2589     }
2590 #endif
2591     REGCP_SET(lastcp);
2592     if (regmatch(reginfo, progi->program + 1)) {
2593         PL_regoffs[0].end = PL_reginput - PL_bostr;
2594         return 1;
2595     }
2596     if (reginfo->cutpoint) /* cleared on entry, so non-NULL means it was set during this attempt */
2597         *startpos= reginfo->cutpoint;
2598     REGCP_UNWIND(lastcp);
2599     return 0;
2600 }
2601
2602
2603 #define sayYES goto yes                /* jump to the "yes" label of the function using the macro */
2604 #define sayNO goto no                  /* jump to its "no" label */
2605 #define sayNO_SILENT goto no_silent    /* jump to its "no_silent" label */
2606
2607 /* we don't use STMT_START/END here because it leads to
2608    "unreachable code" warnings, which are bogus, but distracting.  NB: the macro therefore expands to an if statement followed by a separate goto, so never use it as the body of an unbraced if/else. */
2609 #define CACHEsayNO \
2610     if (ST.cache_mask) \
2611        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2612     sayNO
2613
2614 /* this is used to determine how far from the left messages like
2615    'failed...' are printed. It should be set such that messages
2616    line up with the regop output that created them.
2617 */
2618 #define REPORT_CODE_OFF 32
2619
2620
2621 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2622 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2623
2624 #define SLAB_FIRST(s) (&(s)->states[0])    /* address of the first state slot in slab s */
2625 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])    /* address of the last state slot in slab s */
2626
2627 /* step to the slab after the current one, appending a fresh slab if the chain ends here; return its first slot */
2628 STATIC regmatch_state *
2629 S_push_slab(pTHX)
2630 {
2631 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2632     dMY_CXT;
2633 #endif
2634     regmatch_slab * const tail = PL_regmatch_slab;
2635     regmatch_slab *slab = tail->next;
2636     if (slab == NULL) {         /* nothing after tail: allocate a slab and link it in */
2637         Newx(slab, 1, regmatch_slab);
2638         slab->next = NULL;
2639         slab->prev = tail;
2640         tail->next = slab;
2641     }
2642     PL_regmatch_slab = slab;
2643     return SLAB_FIRST(slab);
2644 }
2645
2646
2647 /* push a new state then goto it: sets scan to node, stores the resume state in st, and jumps to the push_state label.  Expands to bare statements with no wrapper, so use it only as a full statement inside braces. */
2648
2649 #define PUSH_STATE_GOTO(state, node) \
2650     scan = node; \
2651     st->resume_state = state; \
2652     goto push_state;
2653
2654 /* push a new state with success backtracking, then goto it: same assignments, but jumps to push_yes_state (same bare-statement caveat) */
2655
2656 #define PUSH_YES_STATE_GOTO(state, node) \
2657     scan = node; \
2658     st->resume_state = state; \
2659     goto push_yes_state;
2660
2661
2662
2663 /*
2664
2665 regmatch() - main matching routine
2666
2667 This is basically one big switch statement in a loop. We execute an op,
2668 set 'next' to point the next op, and continue. If we come to a point which
2669 we may need to backtrack to on failure such as (A|B|C), we push a
2670 backtrack state onto the backtrack stack. On failure, we pop the top
2671 state, and re-enter the loop at the state indicated. If there are no more
2672 states to pop, we return failure.
2673
2674 Sometimes we also need to backtrack on success; for example /A+/, where
2675 after successfully matching one A, we need to go back and try to
2676 match another one; similarly for lookahead assertions: if the assertion
2677 completes successfully, we backtrack to the state just before the assertion
2678 and then carry on.  In these cases, the pushed state is marked as
2679 'backtrack on success too'. This marking is in fact done by a chain of
2680 pointers, each pointing to the previous 'yes' state. On success, we pop to
2681 the nearest yes state, discarding any intermediate failure-only states.
2682 Sometimes a yes state is pushed just to force some cleanup code to be
2683 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2684 it to free the inner regex.
2685
2686 Note that failure backtracking rewinds the cursor position, while
2687 success backtracking leaves it alone.
2688
2689 A pattern is complete when the END op is executed, while a subpattern
2690 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2691 ops trigger the "pop to last yes state if any, otherwise return true"
2692 behaviour.
2693
2694 A common convention in this function is to use A and B to refer to the two
2695 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2696 the subpattern to be matched possibly multiple times, while B is the entire
2697 rest of the pattern. Variable and state names reflect this convention.
2698
2699 The states in the main switch are the union of ops and failure/success of
2700 substates associated with that op.  For example, IFMATCH is the op
2701 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2702 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2703 successfully matched A and IFMATCH_A_fail is a state saying that we have
2704 just failed to match A. Resume states always come in pairs. The backtrack
2705 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2706 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2707 on success or failure.
2708
2709 The struct that holds a backtracking state is actually a big union, with
2710 one variant for each major type of op. The variable st points to the
2711 top-most backtrack struct. To make the code clearer, within each
2712 block of code we #define ST to alias the relevant union.
2713
2714 Here's a concrete example of a (vastly oversimplified) IFMATCH
2715 implementation:
2716
2717     switch (state) {
2718     ....
2719
2720 #define ST st->u.ifmatch
2721
2722     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2723         ST.foo = ...; // some state we wish to save
2724         ...
2725         // push a yes backtrack state with a resume value of
2726         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2727         // first node of A:
2728         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2729         // NOTREACHED
2730
2731     case IFMATCH_A: // we have successfully executed A; now continue with B
2732         next = B;
2733         bar = ST.foo; // do something with the preserved value
2734         break;
2735
2736     case IFMATCH_A_fail: // A failed, so the assertion failed
2737         ...;   // do some housekeeping, then ...
2738         sayNO; // propagate the failure
2739
2740 #undef ST
2741
2742     ...
2743     }
2744
2745 For any old-timers reading this who are familiar with the old recursive
2746 approach, the code above is equivalent to:
2747
2748     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2749     {
2750         int foo = ...
2751         ...
2752         if (regmatch(A)) {
2753             next = B;
2754             bar = foo;
2755             break;
2756         }
2757         ...;   // do some housekeeping, then ...
2758         sayNO; // propagate the failure
2759     }
2760
2761 The topmost backtrack state, pointed to by st, is usually free. If you
2762 want to claim it, populate any ST.foo fields in it with values you wish to
2763 save, then do one of
2764
2765         PUSH_STATE_GOTO(resume_state, node);
2766         PUSH_YES_STATE_GOTO(resume_state, node);
2767
2768 which sets that backtrack state's resume value to 'resume_state', pushes a
2769 new free entry to the top of the backtrack stack, then goes to 'node'.
2770 On backtracking, the free slot is popped, and the saved state becomes the
2771 new free state. An ST.foo field in this new top state can be temporarily
2772 accessed to retrieve values, but once the main loop is re-entered, it
2773 becomes available for reuse.
2774
2775 Note that the depth of the backtrack stack constantly increases during the
2776 left-to-right execution of the pattern, rather than going up and down with
2777 the pattern nesting. For example the stack is at its maximum at Z at the
2778 end of the pattern, rather than at X in the following:
2779
2780     /(((X)+)+)+....(Y)+....Z/
2781
2782 The only exceptions to this are lookahead/behind assertions and the cut,
2783 (?>A), which pop all the backtrack states associated with A before
2784 continuing.
2785
2786 Backtrack state structs are allocated in slabs of about 4K in size.
2787 PL_regmatch_state and st always point to the currently active state,
2788 and PL_regmatch_slab points to the slab currently containing
2789 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2790 allocated, and is never freed until interpreter destruction. When the slab
2791 is full, a new one is allocated and chained to the end. At exit from
2792 regmatch(), slabs allocated since entry are freed.
2793
2794 */
2795
2796 /* DEBUG_STATE_pp(pp) - trace a backtrack-state push/pop.
      *
      * Inside DEBUG_STATE_r, dumps the current execution position with
      * DUMP_EXEC_POS and then prints one line to Perl_debug_log: an indent of
      * depth*2 columns, the text 'pp', the PL_reg_name[] entry for
      * st->resume_state, and a bracketed tag ("[Y]", "[M]" or "[YM]") when st is
      * the current yes_state and/or mark_state.
      *
      * 'pp' must be a string literal, because it is pasted between two pieces
      * of the format string.  The macro reads locinput, scan, utf8_target,
      * depth, st, yes_state and mark_state from the enclosing scope, and its
      * expansion already ends in ';'.
      */
2797 #define DEBUG_STATE_pp(pp)                                  \
2798     DEBUG_STATE_r({                                         \
2799         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2800         PerlIO_printf(Perl_debug_log,                       \
2801             "    %*s"pp" %s%s%s%s%s\n",                     \
2802             depth*2, "",                                    \
2803             PL_reg_name[st->resume_state],                     \
2804             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2805             ((st==yes_state) ? "Y" : ""),                   \
2806             ((st==mark_state) ? "M" : ""),                  \
2807             ((st==yes_state||st==mark_state) ? "]" : "")    \
2808         );                                                  \
2809     });
2810
2811 /* Index of regnode x relative to 'prog' (which must be in scope at the use
      * site), converted to int; yields -1 when x is NULL. */
2812 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2813
2814 #ifdef DEBUGGING
2815 /* S_debug_start_match() - print a banner describing the match about to run.
      *
      * Only compiled when DEBUGGING is defined.  Writes
      *     "<blurb> REx <pattern> against <string>"
      * to Perl_debug_log, with "<blurb> REx" wrapped in PL_colors[4] and
      * PL_colors[5], and then, if the pattern and/or the target is UTF-8, a
      * second line of the form "UTF-8 pattern and string...".
      *
      *   prog        - the compiled regex; its precompiled text
      *                 (RX_PRECOMP_const/RX_PRELEN) is what gets displayed
      *   utf8_target - true if the target string is UTF-8
      *   start, end  - bounds of the target text to show; 'end - start' is
      *                 passed as its length
      *   blurb       - leading text for the banner
      */
2816 STATIC void
2817 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2818     const char *start, const char *end, const char *blurb)
2819 {
2820     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2821
2822     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2823     /* presumably sets up the PL_colors[] entries used below -- TODO confirm */
2824     if (!PL_colorset)
2825             reginitcolors();
2826     { /* the RE_PV_QUOTED_DECL uses below presumably declare s0 and s1 -- confirm */
2827         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2828             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2829         /* NOTE(review): the trailing 60 looks like a display-length cap -- confirm */
2830         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2831             start, end - start, 60);
2832
2833         PerlIO_printf(Perl_debug_log,
2834             "%s%s REx%s %s against %s\n",
2835                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2836         /* say which of pattern / string is UTF-8, joined by " and " if both */
2837         if (utf8_target||utf8_pat)
2838             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2839                 utf8_pat ? "pattern" : "",
2840                 utf8_pat && utf8_target ? " and " : "",
2841                 utf8_target ? "string" : ""
2842             );
2843     }
2844 }
2845 /* S_dump_exec_pos() - print the current position within the target string.
      *
      * Writes to Perl_debug_log, without a trailing newline: the offset of
      * locinput from loc_bostr, then '<', a short window of the string around
      * locinput, '>', padding, and a final '|'.  The window is built from three
      * pieces: the text before loc_reg_starttry, the text from there up to
      * locinput, and the text after locinput.  When no colours are configured
      * the literal "> <" is printed between the second and third piece.
      *
      *   locinput         - current position in the target string
      *   scan             - current regnode; only inspected to see whether its
      *                      op is CANY, in which case the text is not treated
      *                      as UTF-8 for display
      *   loc_regeol       - end of the target string
      *   loc_bostr        - start of the target string
      *   loc_reg_starttry - position used to split the prefix into its two
      *                      differently coloured parts
      *   utf8_target      - true if the target string is UTF-8
      */
2846 STATIC void
2847 S_dump_exec_pos(pTHX_ const char *locinput,
2848                       const regnode *scan,
2849                       const char *loc_regeol,
2850                       const char *loc_bostr,
2851                       const char *loc_reg_starttry,
2852                       const bool utf8_target)
2853 {
2854     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2855     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2856     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2857     /* The part of the string before starttry has one color
2858        (pref0_len chars), between starttry and current
2859        position another one (pref_len - pref0_len chars),
2860        after the current position the third one.
2861        We assume that pref0_len <= pref_len, otherwise we
2862        decrease pref0_len.  */
2863     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2864         ? (5 + taill) - l : locinput - loc_bostr;
2865     int pref0_len;
2866
2867     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2868     /* widen the prefix so that it does not begin on a UTF-8 continuation byte.
      * NOTE(review): the loop condition does not compare against loc_bostr;
      * confirm it cannot step back past the start of the buffer. */
2869     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2870         pref_len++;
2871     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2872     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) /* spare room: lengthen the tail */
2873         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2874               ? (5 + taill) - pref_len : loc_regeol - locinput);
2875     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2876         l--; /* shorten while the byte just past the tail is a continuation byte; NOTE(review): no lower bound on l here */
2877     if (pref0_len < 0) /* clamp pref0_len into the range 0..pref_len */
2878         pref0_len = 0;
2879     if (pref0_len > pref_len)
2880         pref0_len = pref_len;
2881     {
2882         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2883         /* NOTE(review): the RE_PV_COLOR_DECL uses below presumably declare sN/lenN; the trailing number pairs look like PL_colors[] indices -- confirm */
2884         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2885             (locinput - pref_len),pref0_len, 60, 4, 5);
2886
2887         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2888                     (locinput - pref_len + pref0_len),
2889                     pref_len - pref0_len, 60, 2, 3);
2890
2891         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2892                     locinput, loc_regeol - locinput, 10, 0, 1);
2893
2894         const STRLEN tlen=len0+len1+len2;
2895         PerlIO_printf(Perl_debug_log,
2896                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2897                     (IV)(locinput - loc_bostr),
2898                     len0, s0,
2899                     len1, s1,
2900                     (docolor ? "" : "> <"),
2901                     len2, s2,
2902                     (int)(tlen > 19 ? 0 :  19 - tlen), /* pad the three pieces out to 19 columns */
2903                     "");
2904     }
2905 }
2906
2907 #endif
2908
2909 /* reg_check_named_buff_matched()
2910  * Checks to see if a named buffer has matched. The data array of
2911  * buffer numbers corresponding to the buffer is expected to reside
2912  * in the regexp->data->data array in the slot stored in the ARG() of the
2913  * node involved. Note that this routine doesn't actually care about the
2914  * name, that information is not preserved from compilation to execution.
2915  * Returns the index of the leftmost defined buffer with the given name
2916  * or 0 if none of the buffers matched.
      *
      * The buffer numbers are tried in the order in which they are stored, and
      * the first one that matched is returned.  A buffer counts as matched when
      * its number does not exceed *PL_reglastparen and its PL_regoffs[].end is
      * not -1.
      *
      *   rex  - the regexp whose data array holds the buffer-number list
      *   scan - the node whose ARG() selects the slot in that data array
2917  */
2918 STATIC I32
2919 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2920 {
2921     I32 n;
2922     RXi_GET_DECL(rex,rexi);
2923     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2924     I32 *nums=(I32*)SvPVX(sv_dat); /* the SV's string buffer is read as an array of I32 buffer numbers */
2925
2926     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2927     /* SvIVX(sv_dat) is used as the number of entries in nums[] */
2928     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2929         if ((I32)*PL_reglastparen >= nums[n] &&
2930             PL_regoffs[nums[n]].end != -1)
2931         {
2932             return nums[n];
2933         }
2934     }
2935     return 0;
2936 }
2937
2938
2939 /* free all slabs above current one  - called during LEAVE_SCOPE */
2940 /* 'p' is ignored.  PL_regmatch_slab itself is kept; only the slabs chained
      * after it through ->next are released with Safefree(). */
2941 STATIC void
2942 S_clear_backtrack_stack(pTHX_ void *p)
2943 {
2944     regmatch_slab *s = PL_regmatch_slab->next;
2945     PERL_UNUSED_ARG(p);
2946     /* nothing is chained after the current slab, so there is nothing to free */
2947     if (!s)
2948         return;
2949     PL_regmatch_slab->next = NULL; /* detach the chain before freeing it */
2950     while (s) {
2951         regmatch_slab * const osl = s;
2952         s = s->next; /* fetch the successor before osl is freed */
2953         Safefree(osl);
2954     }
2955 }
2956
2957 /* SETREX(Re1,Re2) - assign Re2 to Re1; when PL_reg_eval_set is true, first
      * pass Re2 to PM_SETRE for PL_reg_curpm as well.  The expansion is two
      * statements with no trailing ';' (an 'if' followed by the assignment), so
      * under an unbraced if/else only the first statement would be guarded.
      * Re2 is evaluated twice when PL_reg_eval_set is true. */
2958 #define SETREX(Re1,Re2) \
2959     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2960     Re1 = (Re2)
2961
2962 STATIC I32                      /* 0 failure, 1 success */
2963 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2964 {
2965 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2966     dMY_CXT;
2967 #endif
2968     dVAR;
2969     register const bool utf8_target = PL_reg_match_utf8;
2970     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2971     REGEXP *rex_sv = reginfo->prog;
2972     regexp *rex = (struct regexp *)SvANY(rex_sv);
2973     RXi_GET_DECL(rex,rexi);
2974     I32 oldsave;
2975     /* the current state. This is a cached copy of PL_regmatch_state */
2976     register regmatch_state *st;
2977     /* cache heavy used fields of st in registers */
2978     register regnode *scan;
2979     register regnode *next;
2980     register U32 n = 0; /* general value; init to avoid compiler warning */
2981     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2982     register char *locinput = PL_reginput;
2983     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2984
2985     bool result = 0;        /* return value of S_regmatch */
2986     int depth = 0;          /* depth of backtrack stack */
2987     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2988     const U32 max_nochange_depth =
2989         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2990         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2991     regmatch_state *yes_state = NULL; /* state to pop to on success of
2992                                                             subpattern */
2993     /* mark_state piggy backs on the yes_state logic so that when we unwind
2994        the stack on success we can update the mark_state as we go */
2995     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2996     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2997     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2998     U32 state_num;
2999     bool no_final = 0;      /* prevent failure from backtracking? */
3000     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3001     char *startpoint = PL_reginput;
3002     SV *popmark = NULL;     /* are we looking for a mark? */
3003     SV *sv_commit = NULL;   /* last mark name seen in failure */
3004     SV *sv_yes_mark = NULL; /* last mark name we have seen
3005                                during a successfull match */
3006     U32 lastopen = 0;       /* last open we saw */
3007     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3008     SV* const oreplsv = GvSV(PL_replgv);
3009     /* these three flags are set by various ops to signal information to
3010      * the very next op. They have a useful lifetime of exactly one loop
3011      * iteration, and are not preserved or restored by state pushes/pops
3012      */
3013     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3014     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3015     int logical = 0;        /* the following EVAL is:
3016                                 0: (?{...})
3017                                 1: (?(?{...})X|Y)
3018                                 2: (??{...})
3019                                or the following IFMATCH/UNLESSM is:
3020                                 false: plain (?=foo)
3021                                 true:  used as a condition: (?(?=foo))
3022                             */
3023 #ifdef DEBUGGING
3024     GET_RE_DEBUG_FLAGS_DECL;
3025 #endif
3026
3027     PERL_ARGS_ASSERT_REGMATCH;
3028
3029     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3030             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3031     }));
3032     /* on first ever call to regmatch, allocate first slab */
3033     if (!PL_regmatch_slab) {
3034         Newx(PL_regmatch_slab, 1, regmatch_slab);
3035         PL_regmatch_slab->prev = NULL;
3036         PL_regmatch_slab->next = NULL;
3037         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3038     }
3039
3040     oldsave = PL_savestack_ix;
3041     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3042     SAVEVPTR(PL_regmatch_slab);
3043     SAVEVPTR(PL_regmatch_state);
3044
3045     /* grab next free state slot */
3046     st = ++PL_regmatch_state;
3047     if (st >  SLAB_LAST(PL_regmatch_slab))
3048         st = PL_regmatch_state = S_push_slab(aTHX);
3049
3050     /* Note that nextchr is a byte even in UTF */
3051     nextchr = UCHARAT(locinput);
3052     scan = prog;
3053     while (scan != NULL) {
3054
3055         DEBUG_EXECUTE_r( {
3056             SV * const prop = sv_newmortal();
3057             regnode *rnext=regnext(scan);
3058             DUMP_EXEC_POS( locinput, scan, utf8_target );
3059             regprop(rex, prop, scan);
3060
3061             PerlIO_printf(Perl_debug_log,
3062                     "%3"IVdf":%*s%s(%"IVdf")\n",
3063                     (IV)(scan - rexi->program), depth*2, "",
3064                     SvPVX_const(prop),
3065                     (PL_regkind[OP(scan)] == END || !rnext) ?
3066                         0 : (IV)(rnext - rexi->program));
3067         });
3068
3069         next = scan + NEXT_OFF(scan);
3070         if (next == scan)
3071             next = NULL;
3072         state_num = OP(scan);
3073
3074       reenter_switch:
3075
3076         assert(PL_reglastparen == &rex->lastparen);
3077         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3078         assert(PL_regoffs == rex->offs);
3079
3080         switch (state_num) {
3081         case BOL:
3082             if (locinput == PL_bostr)
3083             {
3084                 /* reginfo->till = reginfo->bol; */
3085                 break;
3086             }
3087             sayNO;
3088         case MBOL:
3089             if (locinput == PL_bostr ||
3090                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3091             {
3092                 break;
3093             }
3094             sayNO;
3095         case SBOL:
3096             if (locinput == PL_bostr)
3097                 break;
3098             sayNO;
3099         case GPOS:
3100             if (locinput == reginfo->ganch)
3101                 break;
3102             sayNO;
3103
3104         case KEEPS:
3105             /* update the startpoint */
3106             st->u.keeper.val = PL_regoffs[0].start;
3107             PL_reginput = locinput;
3108             PL_regoffs[0].start = locinput - PL_bostr;
3109             PUSH_STATE_GOTO(KEEPS_next, next);
3110             /*NOT-REACHED*/
3111         case KEEPS_next_fail:
3112             /* rollback the start point change */
3113             PL_regoffs[0].start = st->u.keeper.val;
3114             sayNO_SILENT;
3115             /*NOT-REACHED*/
3116         case EOL:
3117                 goto seol;
3118         case MEOL:
3119             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3120                 sayNO;
3121             break;
3122         case SEOL:
3123           seol:
3124             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3125                 sayNO;
3126             if (PL_regeol - locinput > 1)
3127                 sayNO;
3128             break;
3129         case EOS:
3130             if (PL_regeol != locinput)
3131                 sayNO;
3132             break;
3133         case SANY:
3134             if (!nextchr && locinput >= PL_regeol)
3135                 sayNO;
3136             if (utf8_target) {
3137                 locinput += PL_utf8skip[nextchr];
3138                 if (locinput > PL_regeol)
3139                     sayNO;
3140                 nextchr = UCHARAT(locinput);
3141             }
3142             else
3143                 nextchr = UCHARAT(++locinput);
3144             break;
3145         case CANY:
3146             if (!nextchr && locinput >= PL_regeol)
3147                 sayNO;
3148             nextchr = UCHARAT(++locinput);
3149             break;
3150         case REG_ANY:
3151             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3152                 sayNO;
3153             if (utf8_target) {
3154                 locinput += PL_utf8skip[nextchr];
3155                 if (locinput > PL_regeol)
3156                     sayNO;
3157                 nextchr = UCHARAT(locinput);
3158             }
3159             else
3160                 nextchr = UCHARAT(++locinput);
3161             break;
3162
3163 #undef  ST
3164 #define ST st->u.trie
3165         case TRIEC:
3166             /* In this case the charclass data is available inline so
3167                we can fail fast without a lot of extra overhead.
3168              */
3169             if (scan->flags == EXACT || !utf8_target) {
3170                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3171                     DEBUG_EXECUTE_r(
3172                         PerlIO_printf(Perl_debug_log,
3173                                   "%*s  %sfailed to match trie start class...%s\n",
3174                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3175                     );
3176                     sayNO_SILENT;
3177                     /* NOTREACHED */
3178                 }
3179             }
3180             /* FALL THROUGH */
3181         case TRIE:
3182             /* the basic plan of execution of the trie is:
3183              * At the beginning, run though all the states, and
3184              * find the longest-matching word. Also remember the position
3185              * of the shortest matching word. For example, this pattern:
3186              *    1  2 3 4    5
3187              *    ab|a|x|abcd|abc
3188              * when matched against the string "abcde", will generate
3189              * accept states for all words except 3, with the longest
3190              * matching word being 4, and the shortest being 1 (with
3191              * the position being after char 1 of the string).
3192              *
3193              * Then for each matching word, in word order (i.e. 1,2,4,5),
3194              * we run the remainder of the pattern; on each try setting
3195              * the current position to the character following the word,
3196              * returning to try the next word on failure.
3197              *
3198              * We avoid having to build a list of words at runtime by
3199              * using a compile-time structure, wordinfo[].prev, which
3200              * gives, for each word, the previous accepting word (if any).
3201              * In the case above it would contain the mappings 1->2, 2->0,
3202              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3203              * the longest word (4 above), a list of all words, by
3204              * following the list of prev pointers; this gives us the
3205              * unordered list 4,5,1,2. Then given the current word we have
3206              * just tried, we can go through the list and find the
3207              * next-biggest word to try (so if we just failed on word 2,
3208              * the next in the list is 4).
3209              *
3210              * Since at runtime we don't record the matching position in
3211              * the string for each word, we have to work that out for
3212              * each word we're about to process. The wordinfo table holds
3213              * the character length of each word; given that we recorded
3214              * at the start: the position of the shortest word and its
3215              * length in chars, we just need to move the pointer the
3216              * difference between the two char lengths. Depending on
3217              * Unicode status and folding, that's cheap or expensive.
3218              *
3219              * This algorithm is optimised for the case where are only a
3220              * small number of accept states, i.e. 0,1, or maybe 2.
3221              * With lots of accepts states, and having to try all of them,
3222              * it becomes quadratic on number of accept states to find all
3223              * the next words.
3224              */
3225
3226             {
3227                 /* what type of TRIE am I? (utf8 makes this contextual) */
3228                 DECL_TRIE_TYPE(scan);
3229
3230                 /* what trie are we using right now */
3231                 reg_trie_data * const trie
3232                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3233                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3234                 U32 state = trie->startstate;
3235
3236                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3237                     !TRIE_BITMAP_TEST(trie,*locinput)
3238                 ) {
3239                     if (trie->states[ state ].wordnum) {
3240                          DEBUG_EXECUTE_r(
3241                             PerlIO_printf(Perl_debug_log,
3242                                           "%*s  %smatched empty string...%s\n",
3243                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3244                         );
3245                         if (!trie->jump)
3246                             break;
3247                     } else {
3248                         DEBUG_EXECUTE_r(
3249                             PerlIO_printf(Perl_debug_log,
3250                                           "%*s  %sfailed to match trie start class...%s\n",
3251                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3252                         );
3253                         sayNO_SILENT;
3254                    }
3255                 }
3256
3257             {
3258                 U8 *uc = ( U8* )locinput;
3259
3260                 STRLEN len = 0;
3261                 STRLEN foldlen = 0;
3262                 U8 *uscan = (U8*)NULL;
3263                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3264                 U32 charcount = 0; /* how many input chars we have matched */
3265                 U32 accepted = 0; /* have we seen any accepting states? */
3266
3267                 ST.B = next;
3268                 ST.jump = trie->jump;
3269                 ST.me = scan;
3270                 ST.firstpos = NULL;
3271                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3272                 ST.nextword = 0;
3273
3274                 /* fully traverse the TRIE; note the position of the
3275                    shortest accept state and the wordnum of the longest
3276                    accept state */
3277
3278                 while ( state && uc <= (U8*)PL_regeol ) {
3279                     U32 base = trie->states[ state ].trans.base;
3280                     UV uvc = 0;
3281                     U16 charid = 0;
3282                     U16 wordnum;
3283                     wordnum = trie->states[ state ].wordnum;
3284
3285                     if (wordnum) { /* it's an accept state */
3286                         if (!accepted) {
3287                             accepted = 1;
3288                             /* record first match position */
3289                             if (ST.longfold) {
3290                                 ST.firstpos = (U8*)locinput;
3291                                 ST.firstchars = 0;
3292                             }
3293                             else {
3294                                 ST.firstpos = uc;
3295                                 ST.firstchars = charcount;
3296                             }
3297                         }
3298                         if (!ST.nextword || wordnum < ST.nextword)
3299                             ST.nextword = wordnum;
3300                         ST.topword = wordnum;
3301                     }
3302
3303                     DEBUG_TRIE_EXECUTE_r({
3304                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3305                                 PerlIO_printf( Perl_debug_log,
3306                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3307                                     2+depth * 2, "", PL_colors[4],
3308                                     (UV)state, (accepted ? 'Y' : 'N'));
3309                     });
3310
3311                     /* read a char and goto next state */
3312                     if ( base ) {
3313                         I32 offset;
3314                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3315                                              uscan, len, uvc, charid, foldlen,
3316                                              foldbuf, uniflags);
3317                         charcount++;
3318                         if (foldlen>0)
3319                             ST.longfold = TRUE;
3320                         if (charid &&
3321                              ( ((offset =
3322                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3323
3324                              && ((U32)offset < trie->lasttrans)
3325                              && trie->trans[offset].check == state)
3326                         {
3327                             state = trie->trans[offset].next;
3328                         }
3329                         else {
3330                             state = 0;
3331                         }
3332                         uc += len;
3333
3334                     }
3335                     else {
3336                         state = 0;
3337                     }
3338                     DEBUG_TRIE_EXECUTE_r(
3339                         PerlIO_printf( Perl_debug_log,
3340                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3341                             charid, uvc, (UV)state, PL_colors[5] );
3342                     );
3343                 }
3344                 if (!accepted)
3345                    sayNO;
3346
3347                 /* calculate total number of accept states */
3348                 {
3349                     U16 w = ST.topword;
3350                     accepted = 0;
3351                     while (w) {
3352                         w = trie->wordinfo[w].prev;
3353                         accepted++;
3354                     }
3355                     ST.accepted = accepted;
3356                 }
3357
3358                 DEBUG_EXECUTE_r(
3359                     PerlIO_printf( Perl_debug_log,
3360                         "%*s  %sgot %"IVdf" possible matches%s\n",
3361                         REPORT_CODE_OFF + depth * 2, "",
3362                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3363                 );
3364                 goto trie_first_try; /* jump into the fail handler */
3365             }}
3366             /* NOTREACHED */
3367
3368         case TRIE_next_fail: /* we failed - try next alternative */
3369             if ( ST.jump) {
3370                 REGCP_UNWIND(ST.cp);
3371                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3372                     PL_regoffs[n].end = -1;
3373                 *PL_reglastparen = n;
3374             }
3375             if (!--ST.accepted) {
3376                 DEBUG_EXECUTE_r({
3377                     PerlIO_printf( Perl_debug_log,
3378                         "%*s  %sTRIE failed...%s\n",
3379                         REPORT_CODE_OFF+depth*2, "",
3380                         PL_colors[4],
3381                         PL_colors[5] );
3382                 });
3383                 sayNO_SILENT;
3384             }
3385             {
3386                 /* Find next-highest word to process.  Note that this code
3387                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3388                 register U16 min = 0;
3389                 register U16 word;
3390                 register U16 const nextword = ST.nextword;
3391                 register reg_trie_wordinfo * const wordinfo
3392                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3393                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3394                     if (word > nextword && (!min || word < min))
3395                         min = word;
3396                 }
3397                 ST.nextword = min;
3398             }
3399
3400           trie_first_try:
3401             if (do_cutgroup) {
3402                 do_cutgroup = 0;
3403                 no_final = 0;
3404             }
3405
3406             if ( ST.jump) {
3407                 ST.lastparen = *PL_reglastparen;
3408                 REGCP_SET(ST.cp);
3409             }
3410
3411             /* find start char of end of current word */
3412             {
3413                 U32 chars; /* how many chars to skip */
3414                 U8 *uc = ST.firstpos;
3415                 reg_trie_data * const trie
3416                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3417
3418                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3419                             >=  ST.firstchars);
3420                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3421                             - ST.firstchars;
3422
3423                 if (ST.longfold) {
3424                     /* the hard option - fold each char in turn and find
3425                      * its folded length (which may be different) */
3426                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3427                     STRLEN foldlen;
3428                     STRLEN len;
3429                     UV uvc;
3430                     U8 *uscan;
3431
3432                     while (chars) {
3433                         if (utf8_target) {
3434                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3435                                                     uniflags);
3436                             uc += len;
3437                         }
3438                         else {
3439                             uvc = *uc;
3440                             uc++;
3441                         }
3442                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3443                         uscan = foldbuf;
3444                         while (foldlen) {
3445                             if (!--chars)
3446                                 break;
3447                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3448                                             uniflags);
3449                             uscan += len;
3450                             foldlen -= len;
3451                         }
3452                     }
3453                 }
3454                 else {
3455                     if (utf8_target)
3456                         while (chars--)
3457                             uc += UTF8SKIP(uc);
3458                     else
3459                         uc += chars;
3460                 }
3461                 PL_reginput = (char *)uc;
3462             }
3463
3464             scan = (ST.jump && ST.jump[ST.nextword])
3465                         ? ST.me + ST.jump[ST.nextword]
3466                         : ST.B;
3467
3468             DEBUG_EXECUTE_r({
3469                 PerlIO_printf( Perl_debug_log,
3470                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3471                     REPORT_CODE_OFF+depth*2, "",
3472                     PL_colors[4],
3473                     ST.nextword,
3474                     PL_colors[5]
3475                     );
3476             });
3477
3478             if (ST.accepted > 1 || has_cutgroup) {
3479                 PUSH_STATE_GOTO(TRIE_next, scan);
3480                 /* NOTREACHED */
3481             }
3482             /* only one choice left - just continue */
3483             DEBUG_EXECUTE_r({
3484                 AV *const trie_words
3485                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3486                 SV ** const tmp = av_fetch( trie_words,
3487                     ST.nextword-1, 0 );
3488                 SV *sv= tmp ? sv_newmortal() : NULL;
3489
3490                 PerlIO_printf( Perl_debug_log,
3491                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3492                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3493                     ST.nextword,
3494                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3495                             PL_colors[0], PL_colors[1],
3496                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3497                         )
3498                     : "not compiled under -Dr",
3499                     PL_colors[5] );
3500             });
3501
3502             locinput = PL_reginput;
3503             nextchr = UCHARAT(locinput);
3504             continue; /* execute rest of RE */
3505             /* NOTREACHED */
3506 #undef  ST
3507
3508         case EXACT: {
3509             char *s = STRING(scan);
3510             ln = STR_LEN(scan);
3511             if (utf8_target != UTF_PATTERN) {
3512                 /* The target and the pattern have differing utf8ness. */
3513                 char *l = locinput;
3514                 const char * const e = s + ln;
3515
3516                 if (utf8_target) {
3517                     /* The target is utf8, the pattern is not utf8. */
3518                     while (s < e) {
3519                         STRLEN ulen;
3520                         if (l >= PL_regeol)
3521                              sayNO;
3522                         if (NATIVE_TO_UNI(*(U8*)s) !=
3523                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3524                                             uniflags))
3525                              sayNO;
3526                         l += ulen;
3527                         s ++;
3528                     }
3529                 }
3530                 else {
3531                     /* The target is not utf8, the pattern is utf8. */
3532                     while (s < e) {
3533                         STRLEN ulen;
3534                         if (l >= PL_regeol)
3535                             sayNO;
3536                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3537                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3538                                            uniflags))
3539                             sayNO;
3540                         s += ulen;
3541                         l ++;
3542                     }
3543                 }
3544                 locinput = l;
3545                 nextchr = UCHARAT(locinput);
3546                 break;
3547             }
3548             /* The target and the pattern have the same utf8ness. */
3549             /* Inline the first character, for speed. */
3550             if (UCHARAT(s) != nextchr)
3551                 sayNO;
3552             if (PL_regeol - locinput < ln)
3553                 sayNO;
3554             if (ln > 1 && memNE(s, locinput, ln))
3555                 sayNO;
3556             locinput += ln;
3557             nextchr = UCHARAT(locinput);
3558             break;
3559             }
3560         case EXACTFL: {
3561             re_fold_t folder;
3562             const U8 * fold_array;
3563             const char * s;
3564
3565             PL_reg_flags |= RF_tainted;
3566             folder = foldEQ_locale;
3567             fold_array = PL_fold_locale;
3568             goto do_exactf;
3569
3570         case EXACTFU:
3571             folder = foldEQ_latin1;
3572             fold_array = PL_fold_latin1;
3573             goto do_exactf;
3574
3575         case EXACTF:
3576             folder = foldEQ;
3577             fold_array = PL_fold;
3578
3579           do_exactf:
3580             s = STRING(scan);
3581             ln = STR_LEN(scan);
3582
3583             if (utf8_target || UTF_PATTERN) {
3584               /* Either the target or the pattern is utf8. */
3585                 const char * const l = locinput;
3586                 char *e = PL_regeol;
3587
3588                 if (! foldEQ_utf8(s, 0,  ln, cBOOL(UTF_PATTERN),
3589                                l, &e, 0,  utf8_target)) {
3590                      /* One more case for the sharp s:
3591                       * pack("U0U*", 0xDF) =~ /ss/i,
3592                       * the 0xC3 0x9F are the UTF-8
3593                       * byte sequence for the U+00DF. */
3594
3595                      if (!(utf8_target &&
3596                            toLOWER(s[0]) == 's' &&
3597                            ln >= 2 &&
3598                            toLOWER(s[1]) == 's' &&
3599                            (U8)l[0] == 0xC3 &&
3600                            e - l >= 2 &&
3601                            (U8)l[1] == 0x9F))
3602                           sayNO;
3603                 }
3604                 locinput = e;
3605                 nextchr = UCHARAT(locinput);
3606                 break;
3607             }
3608
3609             /* Neither the target nor the pattern is utf8. */
3610
3611             /* Inline the first character, for speed. */
3612             if (UCHARAT(s) != nextchr &&
3613                 UCHARAT(s) != fold_array[nextchr])
3614             {
3615                 sayNO;
3616             }
3617             if (PL_regeol - locinput < ln)
3618                 sayNO;
3619             if (ln > 1 && ! folder(s, locinput, ln))
3620                 sayNO;
3621             locinput += ln;
3622             nextchr = UCHARAT(locinput);
3623             break;
3624         }
3625         case BOUNDL:
3626         case NBOUNDL:
3627             PL_reg_flags |= RF_tainted;
3628             /* FALL THROUGH */
3629         case BOUND:
3630         case NBOUND:
3631             /* was last char in word? */
3632             if (utf8_target) {
3633                 if (locinput == PL_bostr)
3634                     ln = '\n';
3635                 else {
3636                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3637
3638                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3639                 }
3640                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3641                     ln = isALNUM_uni(ln);
3642                     LOAD_UTF8_CHARCLASS_ALNUM();
3643                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3644                 }
3645                 else {
3646                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3647                     n = isALNUM_LC_utf8((U8*)locinput);
3648                 }
3649             }
3650             else {
3651                 ln = (locinput != PL_bostr) ?
3652                     UCHARAT(locinput - 1) : '\n';
3653                 if (FLAGS(scan) & USE_UNI) {
3654
3655                     /* Here, can't be BOUNDL or NBOUNDL because they never set
3656                      * the flags to USE_UNI */
3657                     ln = isWORDCHAR_L1(ln);
3658                     n = isWORDCHAR_L1(nextchr);
3659                 }
3660                 else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3661                     ln = isALNUM(ln);
3662                     n = isALNUM(nextchr);
3663                 }
3664                 else {
3665                     ln = isALNUM_LC(ln);
3666                     n = isALNUM_LC(nextchr);
3667                 }
3668             }
3669             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3670                                     OP(scan) == BOUNDL))
3671                     sayNO;
3672             break;
3673         case ANYOF:
3674             if (utf8_target) {
3675                 STRLEN inclasslen = PL_regeol - locinput;
3676                 if (locinput >= PL_regeol)
3677                     sayNO;
3678
3679                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3680                     goto anyof_fail;
3681                 locinput += inclasslen;
3682                 nextchr = UCHARAT(locinput);
3683                 break;
3684             }
3685             else {
3686                 if (nextchr < 0)
3687                     nextchr = UCHARAT(locinput);
3688                 if (!nextchr && locinput >= PL_regeol)
3689                     sayNO;
3690                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3691                     goto anyof_fail;
3692                 nextchr = UCHARAT(++locinput);
3693                 break;
3694             }
3695         anyof_fail:
3696             /* If we might have the case of the German sharp s
3697              * in a casefolding Unicode character class. */
3698
3699             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3700                  locinput += SHARP_S_SKIP;
3701                  nextchr = UCHARAT(locinput);
3702             }
3703             else
3704                  sayNO;
3705             break;
3706         /* Special char classes - The defines start on line 129 or so */
3707         CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3708         CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3709
3710         CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3711         CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3712
3713         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3714         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3715
3716         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3717                        a Unicode extended Grapheme Cluster */
3718             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3719               extended Grapheme Cluster is:
3720
3721                CR LF
3722                | Prepend* Begin Extend*
3723                | .
3724
3725                Begin is (Hangul-syllable | ! Control)
3726                Extend is (Grapheme_Extend | Spacing_Mark)
3727                Control is [ GCB_Control CR LF ]
3728
3729                The discussion below shows how the code for CLUMP is derived
3730                from this regex.  Note that most of these concepts are from
3731                property values of the Grapheme Cluster Boundary (GCB) property.
3732                No code point can have multiple property values for a given
3733                property.  Thus a code point in Prepend can't be in Control, but
3734                it must be in !Control.  This is why Control above includes
3735                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3736                property separately, and so can't be in GCB_Control, even though
3737                they logically are controls.  Control is not the same as gc=cc,
3738                but includes format and other characters as well.
3739
3740                The Unicode definition of Hangul-syllable is:
3741                   (L+
3742                    | (L* ( ( V | LV ) V* | LVT ) T*)
3743                    | T+
3744                   )
3745                Each of these is a value for the GCB property, and hence must be
3746                disjoint, so the order they are tested is immaterial, so the
3747                above can safely be changed to
3748                    T+
3749                    | L+
3750                    | (L* ( LVT | ( V | LV ) V*) T*)
3751
3752                The last two terms can be combined like this:
3753                    L* ( L
3754                         | (( LVT | ( V | LV ) V*) T*))
3755
3756                And refactored into this:
3757                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3758
3759                That means that if we have seen any L's at all we can quit
3760                there, but if the next character is an LVT, a V or an LV we
3761                should keep going.
3762
3763                There is a subtlety with Prepend* which showed up in testing.
3764                Note that the Begin, and only the Begin is required in:
3765                 | Prepend* Begin Extend*
3766                Also, Begin contains '! Control'.  A Prepend must be a '!
3767                Control', which means it must be a Begin.  What it comes down to
3768                is that if we match Prepend* and then find no suitable Begin
3769                afterwards, then if we backtrack the last Prepend, that one will
3770                be a suitable Begin.
3771             */
3772
3773             if (locinput >= PL_regeol)
3774                 sayNO;
3775             if  (! utf8_target) {
3776
3777                 /* Match either CR LF  or '.', as all the other possibilities
3778                  * require utf8 */
3779                 locinput++;         /* Match the . or CR */
3780                 if (nextchr == '\r'
3781                     && locinput < PL_regeol
3782                     && UCHARAT(locinput) == '\n') locinput++;
3783             }
3784             else {
3785
3786                 /* Utf8: See if is ( CR LF ); already know that locinput <
3787                  * PL_regeol, so locinput+1 is in bounds */
3788                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3789                     locinput += 2;
3790                 }
3791                 else {
3792                     /* In case have to backtrack to beginning, then match '.' */
3793                     char *starting = locinput;
3794
3795                     /* In case have to backtrack the last prepend */
3796                     char *previous_prepend = 0;
3797
3798                     LOAD_UTF8_CHARCLASS_GCB();
3799
3800                     /* Match (prepend)* */
3801                     while (locinput < PL_regeol
3802                            && swash_fetch(PL_utf8_X_prepend,
3803                                           (U8*)locinput, utf8_target))
3804                     {
3805                         previous_prepend = locinput;
3806                         locinput += UTF8SKIP(locinput);
3807                     }
3808
3809                     /* As noted above, if we matched a prepend character, but
3810                      * the next thing won't match, back off the last prepend we
3811                      * matched, as it is guaranteed to match the begin */
3812                     if (previous_prepend
3813                         && (locinput >=  PL_regeol
3814                             || ! swash_fetch(PL_utf8_X_begin,
3815                                              (U8*)locinput, utf8_target)))
3816                     {
3817                         locinput = previous_prepend;
3818                     }
3819
3820                     /* Note that here we know PL_regeol > locinput, as we
3821                      * tested that upon input to this switch case, and if we
3822                      * moved locinput forward, we tested the result just above
3823                      * and it either passed, or we backed off so that it will
3824                      * now pass */
3825                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3826
3827                         /* Here did not match the required 'Begin' in the
3828                          * second term.  So just match the very first
3829                          * character, the '.' of the final term of the regex */
3830                         locinput = starting + UTF8SKIP(starting);
3831                     } else {
3832
3833                         /* Here is the beginning of a character that can have
3834                          * an extender.  It is either a hangul syllable, or a
3835                          * non-control */
3836                         if (swash_fetch(PL_utf8_X_non_hangul,
3837                                         (U8*)locinput, utf8_target))
3838                         {
3839
3840                             /* Here not a Hangul syllable, must be a
3841                              * ('! Control') */
3842                             locinput += UTF8SKIP(locinput);
3843                         } else {
3844
3845                             /* Here is a Hangul syllable.  It can be composed
3846                              * of several individual characters.  One
3847                              * possibility is T+ */
3848                             if (swash_fetch(PL_utf8_X_T,
3849                                             (U8*)locinput, utf8_target))
3850                             {
3851                                 while (locinput < PL_regeol
3852                                         && swash_fetch(PL_utf8_X_T,
3853                                                         (U8*)locinput, utf8_target))
3854                                 {
3855                                     locinput += UTF8SKIP(locinput);
3856                                 }
3857                             } else {
3858
3859                                 /* Here, not T+, but is a Hangul.  That means
3860                                  * it is one of the others: L, LV, LVT or V,
3861                                  * and matches:
3862                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3863
3864                                 /* Match L*           */
3865                                 while (locinput < PL_regeol
3866                                         && swash_fetch(PL_utf8_X_L,
3867                                                         (U8*)locinput, utf8_target))
3868                                 {
3869                                     locinput += UTF8SKIP(locinput);
3870                                 }
3871
3872                                 /* Here, have exhausted L*.  If the next
3873                                  * character is not an LV, LVT nor V, it means
3874                                  * we had to have at least one L, so matches L+
3875                                  * in the original equation, we have a complete
3876                                  * hangul syllable.  Are done. */
3877
3878                                 if (locinput < PL_regeol
3879                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3880                                                     (U8*)locinput, utf8_target))
3881                                 {
3882
3883                                     /* Otherwise keep going.  Must be LV, LVT
3884                                      * or V.  See if LVT */
3885                                     if (swash_fetch(PL_utf8_X_LVT,
3886                                                     (U8*)locinput, utf8_target))
3887                                     {
3888                                         locinput += UTF8SKIP(locinput);
3889                                     } else {
3890
3891                                         /* Must be  V or LV.  Take it, then
3892                                          * match V*     */
3893                                         locinput += UTF8SKIP(locinput);
3894                                         while (locinput < PL_regeol
3895                                                 && swash_fetch(PL_utf8_X_V,
3896                                                          (U8*)locinput, utf8_target))
3897                                         {
3898                                             locinput += UTF8SKIP(locinput);
3899                                         }
3900                                     }
3901
3902                                     /* And any of LV, LVT, or V can be followed
3903                                      * by T*            */
3904                                     while (locinput < PL_regeol
3905                                            && swash_fetch(PL_utf8_X_T,
3906                                                            (U8*)locinput,
3907                                                            utf8_target))
3908                                     {
3909                                         locinput += UTF8SKIP(locinput);
3910                                     }
3911                                 }
3912                             }
3913                         }
3914
3915                         /* Match any extender */
3916                         while (locinput < PL_regeol
3917                                 && swash_fetch(PL_utf8_X_extend,
3918                                                 (U8*)locinput, utf8_target))
3919                         {
3920                             locinput += UTF8SKIP(locinput);
3921                         }
3922                     }
3923                 }
3924                 if (locinput > PL_regeol) sayNO;
3925             }
3926             nextchr = UCHARAT(locinput);
3927             break;
3928
3929         case NREFFL:
3930         {   /* The capture buffer cases.  The ones beginning with N for the
3931                named buffers just convert to the equivalent numbered and
3932                pretend they were called as the corresponding numbered buffer
3933                op.  */
3934             /* don't initialize these, it makes C++ unhappy */
3935             char *s;
3936             char type;
3937             re_fold_t folder;
3938             const U8 *fold_array;
3939
3940             folder = NULL;      /* NULL assumes will be NREF, REF: no
3941                                    folding */
3942             fold_array = NULL;
3943
3944             PL_reg_flags |= RF_tainted;
3945             folder = foldEQ_locale;
3946             fold_array = PL_fold_locale;
3947             type = REFFL;
3948             goto do_nref;
3949
3950         case NREFFU:
3951             folder = foldEQ_latin1;
3952             fold_array = PL_fold_latin1;
3953             type = REFFU;
3954             goto do_nref;
3955
3956         case NREFF:
3957             folder = foldEQ;
3958             fold_array = PL_fold;
3959             type = REFF;
3960             goto do_nref;
3961
3962         case NREF:
3963             type = REF;
3964             folder = NULL;
3965             fold_array = NULL;
3966           do_nref:
3967
3968             /* For the named back references, find the corresponding buffer
3969              * number */
3970             n = reg_check_named_buff_matched(rex,scan);
3971
3972             if ( ! n ) {
3973                 sayNO;
3974             }
3975             goto do_nref_ref_common;
3976
3977         case REFFL:
3978             PL_reg_flags |= RF_tainted;
3979             folder = foldEQ_locale;
3980             fold_array = PL_fold_locale;
3981             goto do_ref;
3982
3983         case REFFU:
3984             folder = foldEQ_latin1;
3985             fold_array = PL_fold_latin1;
3986             goto do_ref;
3987
3988         case REFF:
3989             folder = foldEQ;
3990             fold_array = PL_fold;
3991             goto do_ref;
3992
3993         case REF:
3994             folder = NULL;
3995             fold_array = NULL;
3996
3997           do_ref:
3998             type = OP(scan);
3999             n = ARG(scan);  /* which paren pair */
4000
4001           do_nref_ref_common:
4002             ln = PL_regoffs[n].start;
4003             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4004             if (*PL_reglastparen < n || ln == -1)
4005                 sayNO;                  /* Do not match unless seen CLOSEn. */
4006             if (ln == PL_regoffs[n].end)
4007                 break;
4008
4009             s = PL_bostr + ln;
4010             if (type != REF     /* REF can do byte comparison */
4011                 && (utf8_target
4012                     || (type == REFFU
4013                         && (*s == (char) LATIN_SMALL_LETTER_SHARP_S
4014                             || *locinput == (char) LATIN_SMALL_LETTER_SHARP_S))))
4015             { /* XXX handle REFFL better */
4016                 char * limit = PL_regeol;
4017
4018                 /* This call case insensitively compares the entire buffer
4019                     * at s, with the current input starting at locinput, but
4020                     * not going off the end given by PL_regeol, and returns in
4021                     * limit upon success, how much of the current input was
4022                     * matched */
4023                 if (! foldEQ_utf8(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4024                                     locinput, &limit, 0, utf8_target))
4025                 {
4026                     sayNO;
4027                 }
4028                 locinput = limit;
4029                 nextchr = UCHARAT(locinput);
4030                 break;
4031             }
4032
4033             /* Not utf8:  Inline the first character, for speed. */
4034             if (UCHARAT(s) != nextchr &&
4035                 (type == REF ||
4036                  UCHARAT(s) != fold_array[nextchr]))
4037                 sayNO;
4038             ln = PL_regoffs[n].end - ln;
4039             if (locinput + ln > PL_regeol)
4040                 sayNO;
4041             if (ln > 1 && (type == REF
4042                            ? memNE(s, locinput, ln)
4043                            : ! folder(s, locinput, ln)))
4044                 sayNO;
4045             locinput += ln;
4046             nextchr = UCHARAT(locinput);
4047             break;
4048         }
4049         case NOTHING:
4050         case TAIL:
4051             break;
4052         case BACK:
4053             break;
4054
4055 #undef  ST
4056 #define ST st->u.eval
4057         {
4058             SV *ret;
4059             REGEXP *re_sv;
4060             regexp *re;
4061             regexp_internal *rei;
4062             regnode *startpoint;
4063
4064         case GOSTART:
4065         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4066             if (cur_eval && cur_eval->locinput==locinput) {
4067                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4068                     Perl_croak(aTHX_ "Infinite recursion in regex");
4069                 if ( ++nochange_depth > max_nochange_depth )
4070                     Perl_croak(aTHX_
4071                         "Pattern subroutine nesting without pos change"
4072                         " exceeded limit in regex");
4073             } else {
4074                 nochange_depth = 0;
4075             }
4076             re_sv = rex_sv;
4077             re = rex;
4078             rei = rexi;
4079             (void)ReREFCNT_inc(rex_sv);
4080             if (OP(scan)==GOSUB) {
4081                 startpoint = scan + ARG2L(scan);
4082                 ST.close_paren = ARG(scan);
4083             } else {
4084                 startpoint = rei->program+1;
4085                 ST.close_paren = 0;
4086             }
4087             goto eval_recurse_doit;
4088             /* NOTREACHED */
4089         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4090             if (cur_eval && cur_eval->locinput==locinput) {
4091                 if ( ++nochange_depth > max_nochange_depth )
4092                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4093             } else {
4094                 nochange_depth = 0;
4095             }
4096             {
4097                 /* execute the code in the {...} */
4098                 dSP;
4099                 SV ** const before = SP;
4100                 OP_4tree * const oop = PL_op;
4101                 COP * const ocurcop = PL_curcop;
4102                 PAD *old_comppad;
4103                 char *saved_regeol = PL_regeol;
4104                 struct re_save_state saved_state;
4105
4106                 /* To not corrupt the existing regex state while executing the
4107                  * eval we would normally put it on the save stack, like with
4108                  * save_re_context. However, re-evals have a weird scoping so we
4109                  * can't just add ENTER/LEAVE here. With that, things like
4110                  *
4111                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4112                  *
4113                  * would break, as they expect the localisation to be unwound
4114                  * only when the re-engine backtracks through the bit that
4115                  * localised it.
4116                  *
4117                  * What we do instead is just save the state in a local C
4118                  * variable.
4119                  */
4120                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4121
4122                 n = ARG(scan);
4123                 PL_op = (OP_4tree*)rexi->data->data[n];
4124                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4125                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4126                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4127                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4128
4129                 if (sv_yes_mark) {
4130                     SV *sv_mrk = get_sv("REGMARK", 1);
4131                     sv_setsv(sv_mrk, sv_yes_mark);
4132                 }
4133
4134                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4135                 SPAGAIN;
4136                 if (SP == before)
4137                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4138                 else {
4139                     ret = POPs;
4140                     PUTBACK;
4141                 }
4142
4143                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4144
4145                 PL_op = oop;
4146                 PAD_RESTORE_LOCAL(old_comppad);
4147                 PL_curcop = ocurcop;
4148                 PL_regeol = saved_regeol;
4149                 if (!logical) {
4150                     /* /(?{...})/ */
4151                     sv_setsv(save_scalar(PL_replgv), ret);
4152                     break;
4153                 }
4154             }
4155             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4156                 logical = 0;
4157                 {
4158                     /* extract RE object from returned value; compiling if
4159                      * necessary */
4160                     MAGIC *mg = NULL;
4161                     REGEXP *rx = NULL;
4162
4163                     if (SvROK(ret)) {
4164                         SV *const sv = SvRV(ret);
4165
4166                         if (SvTYPE(sv) == SVt_REGEXP) {
4167                             rx = (REGEXP*) sv;
4168                         } else if (SvSMAGICAL(sv)) {
4169                             mg = mg_find(sv, PERL_MAGIC_qr);
4170                             assert(mg);
4171                         }
4172                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4173                         rx = (REGEXP*) ret;
4174                     } else if (SvSMAGICAL(ret)) {
4175                         if (SvGMAGICAL(ret)) {
4176                             /* I don't believe that there is ever qr magic
4177                                here.  */
4178                             assert(!mg_find(ret, PERL_MAGIC_qr));
4179                             sv_unmagic(ret, PERL_MAGIC_qr);
4180                         }
4181                         else {
4182                             mg = mg_find(ret, PERL_MAGIC_qr);
4183                             /* testing suggests mg only ends up non-NULL for
4184                                scalars who were upgraded and compiled in the
4185                                else block below. In turn, this is only
4186                                triggered in the "postponed utf8 string" tests
4187                                in t/op/pat.t  */
4188                         }
4189                     }
4190
4191                     if (mg) {
4192                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4193                         assert(rx);
4194                     }
4195                     if (rx) {
4196                         rx = reg_temp_copy(NULL, rx);
4197                     }
4198                     else {
4199                         U32 pm_flags = 0;
4200                         const I32 osize = PL_regsize;
4201
4202                         if (DO_UTF8(ret)) {
4203                             assert (SvUTF8(ret));
4204                         } else if (SvUTF8(ret)) {
4205                             /* Not doing UTF-8, despite what the SV says. Is
4206                                this only if we're trapped in use 'bytes'?  */
4207                             /* Make a copy of the octet sequence, but without
4208                                the flag on, as the compiler now honours the
4209                                SvUTF8 flag on ret.  */
4210                             STRLEN len;
4211                             const char *const p = SvPV(ret, len);
4212                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4213                         }
4214                         rx = CALLREGCOMP(ret, pm_flags);
4215                         if (!(SvFLAGS(ret)
4216                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4217                                  | SVs_GMG))) {
4218                             /* This isn't a first class regexp. Instead, it's
4219                                caching a regexp onto an existing, Perl visible
4220                                scalar.  */
4221                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4222                         }
4223                         PL_regsize = osize;
4224                     }
4225                     re_sv = rx;
4226                     re = (struct regexp *)SvANY(rx);
4227                 }
4228                 RXp_MATCH_COPIED_off(re);
4229                 re->subbeg = rex->subbeg;
4230                 re->sublen = rex->sublen;
4231                 rei = RXi_GET(re);
4232                 DEBUG_EXECUTE_r(
4233                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4234                         "Matching embedded");
4235                 );
4236                 startpoint = rei->program + 1;
4237                 ST.close_paren = 0; /* only used for GOSUB */
4238                 /* borrowed from regtry */
4239                 if (PL_reg_start_tmpl <= re->nparens) {
4240                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4241                     if(PL_reg_start_tmp)
4242                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4243                     else
4244                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4245                 }
4246
4247         eval_recurse_doit: /* Share code with GOSUB below this line */
4248                 /* run the pattern returned from (??{...}) */
4249                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4250                 REGCP_SET(ST.lastcp);
4251
4252                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4253
4254                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (I don't know why) :dmq */
4255                 PL_reglastparen = &re->lastparen;
4256                 PL_reglastcloseparen = &re->lastcloseparen;
4257                 re->lastparen = 0;
4258                 re->lastcloseparen = 0;
4259
4260                 PL_reginput = locinput;
4261                 PL_regsize = 0;
4262
4263                 /* XXXX This is too dramatic a measure... */
4264                 PL_reg_maxiter = 0;
4265
4266                 ST.toggle_reg_flags = PL_reg_flags;
4267                 if (RX_UTF8(re_sv))
4268                     PL_reg_flags |= RF_utf8;
4269                 else
4270                     PL_reg_flags &= ~RF_utf8;
4271                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4272
4273                 ST.prev_rex = rex_sv;
4274                 ST.prev_curlyx = cur_curlyx;
4275                 SETREX(rex_sv,re_sv);
4276                 rex = re;
4277                 rexi = rei;
4278                 cur_curlyx = NULL;
4279                 ST.B = next;
4280                 ST.prev_eval = cur_eval;
4281                 cur_eval = st;
4282                 /* now continue from first node in postponed RE */
4283                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4284                 /* NOTREACHED */
4285             }
4286             /* logical is 1,   /(?(?{...})X|Y)/ */
4287             sw = cBOOL(SvTRUE(ret));
4288             logical = 0;
4289             break;
4290         }
4291
4292         case EVAL_AB: /* cleanup after a successful (??{A})B */
4293             /* note: this is called twice; first after popping B, then A */
4294             PL_reg_flags ^= ST.toggle_reg_flags;
4295             ReREFCNT_dec(rex_sv);
4296             SETREX(rex_sv,ST.prev_rex);
4297             rex = (struct regexp *)SvANY(rex_sv);
4298             rexi = RXi_GET(rex);
4299             regcpblow(ST.cp);
4300             cur_eval = ST.prev_eval;
4301             cur_curlyx = ST.prev_curlyx;
4302
4303             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4304             PL_reglastparen = &rex->lastparen;
4305             PL_reglastcloseparen = &rex->lastcloseparen;
4306             /* also update PL_regoffs */
4307             PL_regoffs = rex->offs;
4308
4309             /* XXXX This is too dramatic a measure... */
4310             PL_reg_maxiter = 0;
4311             if ( nochange_depth )
4312                 nochange_depth--;
4313             sayYES;
4314
4315
4316         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4317             /* note: this is called twice; first after popping B, then A */
4318             PL_reg_flags ^= ST.toggle_reg_flags;
4319             ReREFCNT_dec(rex_sv);
4320             SETREX(rex_sv,ST.prev_rex);
4321             rex = (struct regexp *)SvANY(rex_sv);
4322             rexi = RXi_GET(rex);
4323             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4324             PL_reglastparen = &rex->lastparen;
4325             PL_reglastcloseparen = &rex->lastcloseparen;
4326
4327             PL_reginput = locinput;
4328             REGCP_UNWIND(ST.lastcp);
4329             regcppop(rex);
4330             cur_eval = ST.prev_eval;
4331             cur_curlyx = ST.prev_curlyx;
4332             /* XXXX This is too dramatic a measure... */
4333             PL_reg_maxiter = 0;
4334             if ( nochange_depth )
4335                 nochange_depth--;
4336             sayNO_SILENT;
4337 #undef ST
4338
4339         case OPEN:
4340             n = ARG(scan);  /* which paren pair */
4341             PL_reg_start_tmp[n] = locinput;
4342             if (n > PL_regsize)
4343                 PL_regsize = n;
4344             lastopen = n;
4345             break;
4346         case CLOSE:
4347             n = ARG(scan);  /* which paren pair */
4348             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4349             PL_regoffs[n].end = locinput - PL_bostr;
4350             /*if (n > PL_regsize)
4351                 PL_regsize = n;*/
4352             if (n > *PL_reglastparen)
4353                 *PL_reglastparen = n;
4354             *PL_reglastcloseparen = n;
4355             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4356                 goto fake_end;
4357             }
4358             break;
4359         case ACCEPT:
4360             if (ARG(scan)){
4361                 regnode *cursor;
4362                 for (cursor=scan;
4363                      cursor && OP(cursor)!=END;
4364                      cursor=regnext(cursor))
4365                 {
4366                     if ( OP(cursor)==CLOSE ){
4367                         n = ARG(cursor);
4368                         if ( n <= lastopen ) {
4369                             PL_regoffs[n].start
4370                                 = PL_reg_start_tmp[n] - PL_bostr;
4371                             PL_regoffs[n].end = locinput - PL_bostr;
4372                             /*if (n > PL_regsize)
4373                             PL_regsize = n;*/
4374                             if (n > *PL_reglastparen)
4375                                 *PL_reglastparen = n;
4376                             *PL_reglastcloseparen = n;
4377                             if ( n == ARG(scan) || (cur_eval &&
4378                                 cur_eval->u.eval.close_paren == n))
4379                                 break;
4380                         }
4381                     }
4382                 }
4383             }
4384             goto fake_end;
4385             /*NOTREACHED*/
4386         case GROUPP:
4387             n = ARG(scan);  /* which paren pair */
4388             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4389             break;
4390         case NGROUPP:
4391             /* reg_check_named_buff_matched returns 0 for no match */
4392             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4393             break;
4394         case INSUBP:
4395             n = ARG(scan);
4396             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4397             break;
4398         case DEFINEP:
4399             sw = 0;
4400             break;
4401         case IFTHEN:
4402             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4403             if (sw)
4404                 next = NEXTOPER(NEXTOPER(scan));
4405             else {
4406                 next = scan + ARG(scan);
4407                 if (OP(next) == IFTHEN) /* Fake one. */
4408                     next = NEXTOPER(NEXTOPER(next));
4409             }
4410             break;
4411         case LOGICAL:
4412             logical = scan->flags;
4413             break;
4414
4415 /*******************************************************************
4416
4417 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4418 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4419 STAR/PLUS/CURLY/CURLYN are used instead.)
4420
4421 A*B is compiled as <CURLYX><A><WHILEM><B>
4422
4423 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4424 state, which contains the current count, initialised to -1. It also sets
4425 cur_curlyx to point to this state, with any previous value saved in the
4426 state block.
4427
4428 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4429 since the pattern may possibly match zero times (i.e. it's a while {} loop
4430 rather than a do {} while loop).
4431
4432 Each entry to WHILEM represents a successful match of A. The count in the
4433 CURLYX block is incremented, another WHILEM state is pushed, and execution
4434 passes to A or B depending on greediness and the current count.
4435
4436 For example, if matching against the string a1a2a3b (where the aN are
4437 substrings that match /A/), then the match progresses as follows: (the
4438 pushed states are interspersed with the bits of strings matched so far):
4439
4440     <CURLYX cnt=-1>
4441     <CURLYX cnt=0><WHILEM>
4442     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4443     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4444     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4445     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4446
4447 (Contrast this with something like CURLYM, which maintains only a single
4448 backtrack state:
4449
4450     <CURLYM cnt=0> a1
4451     a1 <CURLYM cnt=1> a2
4452     a1 a2 <CURLYM cnt=2> a3
4453     a1 a2 a3 <CURLYM cnt=3> b
4454 )
4455
4456 Each WHILEM state block marks a point to backtrack to upon partial failure
4457 of A or B, and also contains some minor state data related to that
4458 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4459 overall state, such as the count, and pointers to the A and B ops.
4460
4461 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4462 must always point to the *current* CURLYX block, the rules are:
4463
4464 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4465 and set cur_curlyx to point to the new block.
4466
4467 When popping the CURLYX block after a successful or unsuccessful match,
4468 restore the previous cur_curlyx.
4469
4470 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4471 to the outer one saved in the CURLYX block.
4472
4473 When popping the WHILEM block after a successful or unsuccessful B match,
4474 restore the previous cur_curlyx.
4475
4476 Here's an example for the pattern (AI* BI)*BO
4477 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4478
4479 cur_
4480 curlyx backtrack stack
4481 ------ ---------------
4482 NULL
4483 CO     <CO prev=NULL> <WO>
4484 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4485 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4486 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4487
4488 At this point the pattern succeeds, and we work back down the stack to
4489 clean up, restoring as we go:
4490
4491 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4492 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4493 CO     <CO prev=NULL> <WO>
4494 NULL
4495
4496 *******************************************************************/
4497
4498 #define ST st->u.curlyx
4499
4500         case CURLYX:    /* start of /A*B/  (for complex A) */
4501         {
4502             /* No need to save/restore up to this paren */
4503             I32 parenfloor = scan->flags;
4504
4505             assert(next); /* keep Coverity happy */
4506             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4507                 next += ARG(next);
4508
4509             /* XXXX Probably it is better to teach regpush to support
4510                parenfloor > PL_regsize... */
4511             if (parenfloor > (I32)*PL_reglastparen)
4512                 parenfloor = *PL_reglastparen; /* Pessimization... */
4513
4514             ST.prev_curlyx= cur_curlyx;
4515             cur_curlyx = st;
4516             ST.cp = PL_savestack_ix;
4517
4518             /* these fields contain the state of the current curly.
4519              * they are accessed by subsequent WHILEMs */
4520             ST.parenfloor = parenfloor;
4521             ST.me = scan;
4522             ST.B = next;
4523             ST.minmod = minmod;
4524             minmod = 0;
4525             ST.count = -1;      /* this will be updated by WHILEM */
4526             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4527
4528             PL_reginput = locinput;
4529             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4530             /* NOTREACHED */
4531         }
4532
4533         case CURLYX_end: /* just finished matching all of A*B */
4534             cur_curlyx = ST.prev_curlyx;
4535             sayYES;
4536             /* NOTREACHED */
4537
4538         case CURLYX_end_fail: /* just failed to match all of A*B */
4539             regcpblow(ST.cp);
4540             cur_curlyx = ST.prev_curlyx;
4541             sayNO;
4542             /* NOTREACHED */
4543
4544
4545 #undef ST
4546 #define ST st->u.whilem
4547
4548         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4549         {
4550             /* see the discussion above about CURLYX/WHILEM */
4551             I32 n;
4552             int min = ARG1(cur_curlyx->u.curlyx.me);
4553             int max = ARG2(cur_curlyx->u.curlyx.me);
4554             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4555
4556             assert(cur_curlyx); /* keep Coverity happy */
4557             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4558             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4559             ST.cache_offset = 0;
4560             ST.cache_mask = 0;
4561
4562             PL_reginput = locinput;
4563
4564             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4565                   "%*s  whilem: matched %ld out of %d..%d\n",
4566                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4567             );
4568
4569             /* First just match a string of min A's. */
4570
4571             if (n < min) {
4572                 cur_curlyx->u.curlyx.lastloc = locinput;
4573                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4574                 /* NOTREACHED */
4575             }
4576
4577             /* If degenerate A matches "", assume A done. */
4578
4579             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4580                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4581                    "%*s  whilem: empty match detected, trying continuation...\n",
4582                    REPORT_CODE_OFF+depth*2, "")
4583                 );
4584                 goto do_whilem_B_max;
4585             }
4586
4587             /* super-linear cache processing */
4588
4589             if (scan->flags) {
4590
4591                 if (!PL_reg_maxiter) {
4592                     /* start the countdown: Postpone detection until we
4593                      * know the match is not *that* much linear. */
4594                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4595                     /* possible overflow for long strings and many CURLYX's */
4596                     if (PL_reg_maxiter < 0)
4597                         PL_reg_maxiter = I32_MAX;
4598                     PL_reg_leftiter = PL_reg_maxiter;
4599                 }
4600
4601                 if (PL_reg_leftiter-- == 0) {
4602                     /* initialise cache */
4603                     const I32 size = (PL_reg_maxiter + 7)/8;
4604                     if (PL_reg_poscache) {
4605                         if ((I32)PL_reg_poscache_size < size) {
4606                             Renew(PL_reg_poscache, size, char);
4607                             PL_reg_poscache_size = size;
4608                         }
4609                         Zero(PL_reg_poscache, size, char);
4610                     }
4611                     else {
4612                         PL_reg_poscache_size = size;
4613                         Newxz(PL_reg_poscache, size, char);
4614                     }
4615                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4616       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4617                               PL_colors[4], PL_colors[5])
4618                     );
4619                 }
4620
4621                 if (PL_reg_leftiter < 0) {
4622                     /* have we already failed at this position? */
4623                     I32 offset, mask;
4624                     offset  = (scan->flags & 0xf) - 1
4625                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4626                     mask    = 1 << (offset % 8);
4627                     offset /= 8;
4628                     if (PL_reg_poscache[offset] & mask) {
4629                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4630                             "%*s  whilem: (cache) already tried at this position...\n",
4631                             REPORT_CODE_OFF+depth*2, "")
4632                         );
4633                         sayNO; /* cache records failure */
4634                     }
4635                     ST.cache_offset = offset;
4636                     ST.cache_mask   = mask;
4637                 }
4638             }
4639
4640             /* Prefer B over A for minimal matching. */
4641
4642             if (cur_curlyx->u.curlyx.minmod) {
4643                 ST.save_curlyx = cur_curlyx;
4644                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4645                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4646                 REGCP_SET(ST.lastcp);
4647                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4648                 /* NOTREACHED */
4649             }
4650
4651             /* Prefer A over B for maximal matching. */
4652
4653             if (n < max) { /* More greed allowed? */
4654                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4655                 cur_curlyx->u.curlyx.lastloc = locinput;
4656                 REGCP_SET(ST.lastcp);
4657                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4658                 /* NOTREACHED */
4659             }
4660             goto do_whilem_B_max;
4661         }
4662         /* NOTREACHED */
4663
4664         case WHILEM_B_min: /* just matched B in a minimal match */
4665         case WHILEM_B_max: /* just matched B in a maximal match */
4666             cur_curlyx = ST.save_curlyx;
4667             sayYES;
4668             /* NOTREACHED */
4669
4670         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4671             cur_curlyx = ST.save_curlyx;
4672             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4673             cur_curlyx->u.curlyx.count--;
4674             CACHEsayNO;
4675             /* NOTREACHED */
4676
4677         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4678             REGCP_UNWIND(ST.lastcp);
4679             regcppop(rex);
4680             /* FALL THROUGH */
4681         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4682             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4683             cur_curlyx->u.curlyx.count--;
4684             CACHEsayNO;
4685             /* NOTREACHED */
4686
4687         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4688             REGCP_UNWIND(ST.lastcp);
4689             regcppop(rex);      /* Restore some previous $<digit>s? */
4690             PL_reginput = locinput;
4691             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4692                 "%*s  whilem: failed, trying continuation...\n",
4693                 REPORT_CODE_OFF+depth*2, "")
4694             );
4695           do_whilem_B_max:
4696             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4697                 && ckWARN(WARN_REGEXP)
4698                 && !(PL_reg_flags & RF_warned))
4699             {
4700                 PL_reg_flags |= RF_warned;
4701                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4702                      "Complex regular subexpression recursion",
4703                      REG_INFTY - 1);
4704             }
4705
4706             /* now try B */
4707             ST.save_curlyx = cur_curlyx;
4708             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4709             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4710             /* NOTREACHED */
4711
4712         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4713             cur_curlyx = ST.save_curlyx;
4714             REGCP_UNWIND(ST.lastcp);
4715             regcppop(rex);
4716
4717             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4718                 /* Maximum greed exceeded */
4719                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4720                     && ckWARN(WARN_REGEXP)
4721                     && !(PL_reg_flags & RF_warned))
4722                 {
4723                     PL_reg_flags |= RF_warned;
4724                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4725                         "%s limit (%d) exceeded",
4726                         "Complex regular subexpression recursion",
4727                         REG_INFTY - 1);
4728                 }
4729                 cur_curlyx->u.curlyx.count--;
4730                 CACHEsayNO;
4731             }
4732
4733             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4734                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4735             );
4736             /* Try grabbing another A and see if it helps. */
4737             PL_reginput = locinput;
4738             cur_curlyx->u.curlyx.lastloc = locinput;
4739             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4740             REGCP_SET(ST.lastcp);
4741             PUSH_STATE_GOTO(WHILEM_A_min,
4742                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4743             /* NOTREACHED */
4744
4745 #undef  ST
4746 #define ST st->u.branch
4747
4748         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4749             next = scan + ARG(scan);
4750             if (next == scan)
4751                 next = NULL;
4752             scan = NEXTOPER(scan);
4753             /* FALL THROUGH */
4754
4755         case BRANCH:        /*  /(...|A|...)/ */
4756             scan = NEXTOPER(scan); /* scan now points to inner node */
4757             ST.lastparen = *PL_reglastparen;
4758             ST.next_branch = next;
4759             REGCP_SET(ST.cp);
4760             PL_reginput = locinput;
4761
4762             /* Now go into the branch */
4763             if (has_cutgroup) {
4764                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4765             } else {
4766                 PUSH_STATE_GOTO(BRANCH_next, scan);
4767             }
4768             /* NOTREACHED */
4769         case CUTGROUP:
4770             PL_reginput = locinput;
4771             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4772                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4773             PUSH_STATE_GOTO(CUTGROUP_next,next);
4774             /* NOTREACHED */
4775         case CUTGROUP_next_fail:
4776             do_cutgroup = 1;
4777             no_final = 1;
4778             if (st->u.mark.mark_name)
4779                 sv_commit = st->u.mark.mark_name;
4780             sayNO;
4781             /* NOTREACHED */
4782         case BRANCH_next:
4783             sayYES;
4784             /* NOTREACHED */
4785         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4786             if (do_cutgroup) {
4787                 do_cutgroup = 0;
4788                 no_final = 0;
4789             }
4790             REGCP_UNWIND(ST.cp);
4791             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4792                 PL_regoffs[n].end = -1;
4793             *PL_reglastparen = n;
4794             /*dmq: *PL_reglastcloseparen = n; */
4795             scan = ST.next_branch;
4796             /* no more branches? */
4797             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4798                 DEBUG_EXECUTE_r({
4799                     PerlIO_printf( Perl_debug_log,
4800                         "%*s  %sBRANCH failed...%s\n",
4801                         REPORT_CODE_OFF+depth*2, "",
4802                         PL_colors[4],
4803                         PL_colors[5] );
4804                 });
4805                 sayNO_SILENT;
4806             }
4807             continue; /* execute next BRANCH[J] op */
4808             /* NOTREACHED */
4809
4810         case MINMOD:
4811             minmod = 1;
4812             break;
4813
4814 #undef  ST
4815 #define ST st->u.curlym
4816
4817         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4818
4819             /* This is an optimisation of CURLYX that enables us to push
4820              * only a single backtracking state, no matter how many matches
4821              * there are in {m,n}. It relies on the pattern being constant
4822              * length, with no parens to influence future backrefs
4823              */
4824
4825             ST.me = scan;
4826             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4827
4828             /* if paren positive, emulate an OPEN/CLOSE around A */
4829             if (ST.me->flags) {
4830                 U32 paren = ST.me->flags;
4831                 if (paren > PL_regsize)
4832                     PL_regsize = paren;
4833                 if (paren > *PL_reglastparen)
4834                     *PL_reglastparen = paren;
4835                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4836             }
4837             ST.A = scan;
4838             ST.B = next;
4839             ST.alen = 0;
4840             ST.count = 0;
4841             ST.minmod = minmod;
4842             minmod = 0;
4843             ST.c1 = CHRTEST_UNINIT;
4844             REGCP_SET(ST.cp);
4845
4846             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4847                 goto curlym_do_B;
4848
4849           curlym_do_A: /* execute the A in /A{m,n}B/  */
4850             PL_reginput = locinput;
4851             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4852             /* NOTREACHED */
4853
4854         case CURLYM_A: /* we've just matched an A */
                 /* Resume point reached after the A pushed at curlym_do_A has
                  * matched.  st->locinput is the position saved when that
                  * state was pushed; PL_reginput is used below as the position
                  * where the A match ended. */
4855             locinput = st->locinput;
4856             nextchr = UCHARAT(locinput);
4857
4858             ST.count++;
4859             /* after first match, determine A's length: u.curlym.alen */
4860             if (ST.count == 1) {
4861                 if (PL_reg_match_utf8) {
                         /* UTF-8: alen is counted in characters, stepping one
                          * UTF8SKIP() at a time */
4862                     char *s = locinput;
4863                     while (s < PL_reginput) {
4864                         ST.alen++;
4865                         s += UTF8SKIP(s);
4866                     }
4867                 }
4868                 else {
                         /* otherwise alen is a plain byte distance */
4869                     ST.alen = PL_reginput - locinput;
4870                 }
                     /* A matched without consuming anything: set the count
                      * straight to the applicable limit (ARG1 when non-greedy,
                      * ARG2 otherwise) rather than repeating an empty match.
                      * NOTE(review): if that limit were REG_INFTY the test
                      * further down would still loop back to A; presumably the
                      * compiler never emits that combination -- confirm. */
4871                 if (ST.alen == 0)
4872                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4873             }
4874             DEBUG_EXECUTE_r(
4875                 PerlIO_printf(Perl_debug_log,
4876                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4877                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4878                           (IV) ST.count, (IV)ST.alen)
4879             );
4880
4881             locinput = PL_reginput;
4882
                 /* If the current eval frame (cur_eval) has a non-zero
                  * close_paren equal to this group's paren number
                  * (ST.me->flags), stop iterating and continue at fake_end. */
4883             if (cur_eval && cur_eval->u.eval.close_paren &&
4884                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4885                 goto fake_end;
4886
4887             {
                     /* although named "max", this is ARG1 (the min) when the
                      * quantifier is non-greedy */
4888                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4889                 if ( max == REG_INFTY || ST.count < max )
4890                     goto curlym_do_A; /* try to match another A */
4891             }
4892             goto curlym_do_B; /* try to match B */
4893
4894         case CURLYM_A_fail: /* just failed to match an A */
                 /* Unwind to the checkpoint held in ST.cp (set with
                  * REGCP_SET(ST.cp) when this CURLYM was entered; presumably
                  * this discards saved state pushed since then -- confirm). */
4895             REGCP_UNWIND(ST.cp);
4896
                 /* Fail outright when the quantifier is non-greedy, when fewer
                  * than ARG1 (min) As have matched, or when the current eval
                  * frame's close_paren names this group.  Otherwise fall
                  * through and try B with the As matched so far. */
4897             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4898                 || (cur_eval && cur_eval->u.eval.close_paren &&
4899                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4900                 sayNO;
4901
4902           curlym_do_B: /* execute the B in /A{m,n}B/  */
4903             PL_reginput = locinput;
                 /* c1/c2 are computed only once, on the first attempt at B:
                  * CHRTEST_UNINIT means "not computed yet", CHRTEST_VOID means
                  * "no first-character test available". */
4904             if (ST.c1 == CHRTEST_UNINIT) {
4905                 /* calculate c1 and c2 for possible match of 1st char
4906                  * following curly */
4907                 ST.c1 = ST.c2 = CHRTEST_VOID;
4908                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4909                     regnode *text_node = ST.B;
4910                     if (! HAS_TEXT(text_node))
4911                         FIND_NEXT_IMPT(text_node);
4912                     /* this used to be
4913
4914                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4915
4916                         But the former is redundant in light of the latter.
4917
4918                         if this changes back then the macro for
4919                         IS_TEXT and friends need to change.
4920                      */
4921                     if (PL_regkind[OP(text_node)] == EXACT)
4922                     {
4923
                             /* c1 is the first byte of the node's string; c2
                              * comes from the fold table matching the node
                              * type, or equals c1 for any other node type */
4924                         ST.c1 = (U8)*STRING(text_node);
4925                         switch (OP(text_node)) {
4926                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
4927                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
4928                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
4929                             default: ST.c2 = ST.c1;
4930                         }
4931                     }
4932                 }
4933             }
4934
4935             DEBUG_EXECUTE_r(
4936                 PerlIO_printf(Perl_debug_log,
4937                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4938                     (int)(REPORT_CODE_OFF+(depth*2)),
4939                     "", (IV)ST.count)
4940                 );
                 /* Cheap pre-check: when a first-character test exists and the
                  * next input byte is neither c1 nor c2, do not push B at all;
                  * re-enter the switch directly in the CURLYM_B_fail state. */
4941             if (ST.c1 != CHRTEST_VOID
4942                     && UCHARAT(PL_reginput) != ST.c1
4943                     && UCHARAT(PL_reginput) != ST.c2)
4944             {
4945                 /* simulate B failing */
4946                 DEBUG_OPTIMISE_r(
4947                     PerlIO_printf(Perl_debug_log,
4948                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4949                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4950                         (IV)ST.c1,(IV)ST.c2
4951                 ));
4952                 state_num = CURLYM_B_fail;
4953                 goto reenter_switch;
4954             }
4955
4956             if (ST.me->flags) {
4957                 /* mark current A as captured */
4958                 I32 paren = ST.me->flags;
4959                 if (ST.count) {
                         /* the capture is the last A: it ends at the current
                          * position and starts ST.alen positions earlier */
4960                     PL_regoffs[paren].start
4961                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4962                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4963                     /*dmq: *PL_reglastcloseparen = paren; */
4964                 }
4965                 else
                         /* zero iterations: an end offset of -1 is stored */
4966                     PL_regoffs[paren].end = -1;
                     /* the current eval frame's close_paren names this group:
                      * finish via fake_end if at least one A matched,
                      * otherwise fail */
4967                 if (cur_eval && cur_eval->u.eval.close_paren &&
4968                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4969                 {
4970                     if (ST.count)
4971                         goto fake_end;
4972                     else
4973                         sayNO;
4974                 }
4975             }
4976
4977             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4978             /* NOTREACHED */
4979
4980         case CURLYM_B_fail: /* just failed to match a B */
                 /* B did not match after ST.count As.  Non-greedy: take one
                  * more A unless ARG2 (max) is already reached.  Greedy: give
                  * one A back unless ARG1 (min) is already reached. */
4981             REGCP_UNWIND(ST.cp);
4982             if (ST.minmod) {
4983                 I32 max = ARG2(ST.me);
4984                 if (max != REG_INFTY && ST.count == max)
4985                     sayNO;
4986                 goto curlym_do_A; /* try to match a further A */
4987             }
4988             /* backtrack one A */
4989             if (ST.count == ARG1(ST.me) /* min */)
4990                 sayNO;
4991             ST.count--;
                 /* every A has the same length, so stepping back ST.alen
                  * positions lands on the end of the previous A */
4992             locinput = HOPc(locinput, -ST.alen);
4993             goto curlym_do_B; /* try to match B */
4994
4995 #undef ST
     /* from here on ST refers to the "curly" member of the state union */
4996 #define ST st->u.curly
4997
     /* CURLY_SETPAREN(paren, success): does nothing when paren is 0.
      * Otherwise, if success is true, capture group `paren` is set to the
      * single position ending at locinput (start = HOPc(locinput, -1)) and
      * *PL_reglastcloseparen is set to paren; if success is false only the
      * group's end offset is set to -1.
      * Expands to a bare `if` statement, reads locinput and PL_bostr from the
      * calling scope, and evaluates `paren` more than once. */
4998 #define CURLY_SETPAREN(paren, success) \
4999     if (paren) { \
5000         if (success) { \
5001             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5002             PL_regoffs[paren].end = locinput - PL_bostr; \
5003             *PL_reglastcloseparen = paren; \
5004         } \
5005         else \
5006             PL_regoffs[paren].end = -1; \
5007     }
5008
5009         case STAR:              /*  /A*B/ where A is width 1 */
                 /* STAR, PLUS, CURLYN and CURLY only differ in how they fill
                  * in ST.paren/ST.min/ST.max and where A starts; all of them
                  * then share the code at `repeat`. */
5010             ST.paren = 0;
5011             ST.min = 0;
5012             ST.max = REG_INFTY;
5013             scan = NEXTOPER(scan);
5014             goto repeat;
5015         case PLUS:              /*  /A+B/ where A is width 1 */
5016             ST.paren = 0;
5017             ST.min = 1;
5018             ST.max = REG_INFTY;
5019             scan = NEXTOPER(scan);
5020             goto repeat;
5021         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5022             ST.paren = scan->flags;     /* Which paren to set */
                 /* raise PL_regsize and *PL_reglastparen if this group number
                  * is higher than what they currently hold */
5023             if (ST.paren > PL_regsize)
5024                 PL_regsize = ST.paren;
5025             if (ST.paren > *PL_reglastparen)
5026                 *PL_reglastparen = ST.paren;
5027             ST.min = ARG1(scan);  /* min to match */
5028             ST.max = ARG2(scan);  /* max to match */
                 /* when the current eval frame's close_paren names this group,
                  * exactly one iteration is required */
5029             if (cur_eval && cur_eval->u.eval.close_paren &&
5030                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5031                 ST.min=1;
5032                 ST.max=1;
5033             }
                 /* NOTE(review): the extra regnext() compared with CURLY
                  * presumably steps over the group's OPEN node -- confirm */
5034             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5035             goto repeat;
5036         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5037             ST.paren = 0;
5038             ST.min = ARG1(scan);  /* min to match */
5039             ST.max = ARG2(scan);  /* max to match */
5040             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5041           repeat:
5042             /*
5043             * Lookahead to avoid useless match attempts
5044             * when we know what character comes next.
5045             *
5046             * Used to only do .*x and .*?x, but now it allows
5047             * for )'s, ('s and (?{ ... })'s to be in the way
5048             * of the quantifier and the EXACT-like node.  -- japhy
5049             */
                 /* Result of this section: ST.c1/ST.c2 hold the two values the
                  * character after the repeated A may have, or CHRTEST_VOID
                  * when no such test can be derived from `next`. */
5050
5051             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5052                 sayNO;
5053             if (HAS_TEXT(next) || JUMPABLE(next)) {
5054                 U8 *s;
5055                 regnode *text_node = next;
5056
5057                 if (! HAS_TEXT(text_node))
5058                     FIND_NEXT_IMPT(text_node);
5059
5060                 if (! HAS_TEXT(text_node))
5061                     ST.c1 = ST.c2 = CHRTEST_VOID;
5062                 else {
5063                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5064                         ST.c1 = ST.c2 = CHRTEST_VOID;
5065                         goto assume_ok_easy;
5066                     }
5067                     else
5068                         s = (U8*)STRING(text_node);
5069
5070                     /*  Currently we only get here when
5071
5072                         PL_regkind[OP(text_node)] == EXACT
5073
5074                         if this changes back then the macro for IS_TEXT and
5075                         friends need to change. */
5076                     if (!UTF_PATTERN) {
                             /* byte pattern: c1 is the first byte of the
                              * literal, c2 its entry in the fold table for the
                              * node type (or c1 itself for other node types) */
5077                         ST.c1 = *s;
5078                         switch (OP(text_node)) {
5079                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5080                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5081                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5082                             default: ST.c2 = ST.c1; break;
5083                         }
5084                     }
5085                     else { /* UTF_PATTERN */
5086                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
                                  /* case-insensitive literal: c1 is decoded
                                   * from the lower-cased first character and c2
                                   * from the upper-cased one; ulen1/ulen2 are
                                   * filled in but not used afterwards here */
5087                              STRLEN ulen1, ulen2;
5088                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5089                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5090
5091                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5092                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5093 #ifdef EBCDIC
5094                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5095                                                     ckWARN(WARN_UTF8) ?
5096                                                     0 : UTF8_ALLOW_ANY);
5097                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5098                                                     ckWARN(WARN_UTF8) ?
5099                                                     0 : UTF8_ALLOW_ANY);
5100 #else
5101                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5102                                                     uniflags);
5103                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5104                                                     uniflags);
5105 #endif
5106                         }
5107                         else {
                                 /* case-sensitive literal: both values are the
                                  * decoded first character */
5108                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5109                                                      uniflags);
5110                         }
5111                     }
5112                 }
5113             }
5114             else
5115                 ST.c1 = ST.c2 = CHRTEST_VOID;
5116         assume_ok_easy:
5117
                 /* c1/c2 are settled.  Remember A and B, then do the initial
                  * consumption of As: the minimum when non-greedy (minmod),
                  * as many as possible up to ST.max otherwise. */
5118             ST.A = scan;
5119             ST.B = next;
5120             PL_reginput = locinput;
5121             if (minmod) {
5122                 minmod = 0;
                     /* regrepeat()'s result is used as the number of As it
                      * matched; the position it reached is read back from
                      * PL_reginput just below */
5123                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5124                     sayNO;
5125                 ST.count = ST.min;
5126                 locinput = PL_reginput;
5127                 REGCP_SET(ST.cp);
                     /* no first-character test: use the path that tries B
                      * after every single extra A */
5128                 if (ST.c1 == CHRTEST_VOID)
5129                     goto curly_try_B_min;
5130
5131                 ST.oldloc = locinput;
5132
5133                 /* set ST.maxpos to the furthest point along the
5134                  * string that could possibly match */
5135                 if  (ST.max == REG_INFTY) {
                         /* unbounded: last byte of the string, moved back to
                          * the start of its character for a UTF-8 target */
5136                     ST.maxpos = PL_regeol - 1;
5137                     if (utf8_target)
5138                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5139                             ST.maxpos--;
5140                 }
5141                 else if (utf8_target) {
                         /* bounded, UTF-8: advance up to (max - min)
                          * characters while staying within PL_regeol */
5142                     int m = ST.max - ST.min;
5143                     for (ST.maxpos = locinput;
5144                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5145                         ST.maxpos += UTF8SKIP(ST.maxpos);
5146                 }
5147                 else {
                         /* bounded, bytes: (max - min) bytes ahead, clamped to
                          * the last byte of the string */
5148                     ST.maxpos = locinput + ST.max - ST.min;
5149                     if (ST.maxpos >= PL_regeol)
5150                         ST.maxpos = PL_regeol - 1;
5151                 }
5152                 goto curly_try_B_min_known;
5153
5154             }
5155             else {
5156                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5157                 locinput = PL_reginput;
5158                 if (ST.count < ST.min)
5159                     sayNO;
5160                 if ((ST.count > ST.min)
5161                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5162                 {
5163                     /* A{m,n} must come at the end of the string, there's
5164                      * no point in backing off ... */
5165                     ST.min = ST.count;
5166                     /* ...except that $ and \Z can match before *and* after
5167                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5168                        We may back off by one in this case. */
5169                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5170                         ST.min--;
5171                 }
5172                 REGCP_SET(ST.cp);
5173                 goto curly_try_B_max;
5174             }
5175             /* NOTREACHED */
5176
5177
5178         case CURLY_B_min_known_fail:
5179             /* failed to find B in a non-greedy match where c1,c2 valid */
                 /* drop the capture set before B was tried */
5180             if (ST.paren && ST.count)
5181                 PL_regoffs[ST.paren].end = -1;
5182
5183             PL_reginput = locinput;     /* Could be reset... */
5184             REGCP_UNWIND(ST.cp);
5185             /* Couldn't or didn't -- move forward. */
                 /* remember the old position, step over one character, and
                  * count it as one more A (it is verified by regrepeat()
                  * below) */
5186             ST.oldloc = locinput;
5187             if (utf8_target)
5188                 locinput += UTF8SKIP(locinput);
5189             else
5190                 locinput++;
5191             ST.count++;
5192           curly_try_B_min_known:
5193              /* find the next place where 'B' could work, then call B */
5194             {
                     /* n = number of characters between ST.oldloc and the
                      * position finally chosen, i.e. how many As still have to
                      * be matched to get there */
5195                 int n;
5196                 if (utf8_target) {
                         /* start at 1 when arriving from the fail path above,
                          * which already moved locinput past oldloc */
5197                     n = (ST.oldloc == locinput) ? 0 : 1;
5198                     if (ST.c1 == ST.c2) {
5199                         STRLEN len;
5200                         /* set n to utf8_distance(oldloc, locinput) */
5201                         while (locinput <= ST.maxpos &&
5202                                utf8n_to_uvchr((U8*)locinput,
5203                                               UTF8_MAXBYTES, &len,
5204                                               uniflags) != (UV)ST.c1) {
5205                             locinput += len;
5206                             n++;
5207                         }
5208                     }
5209                     else {
5210                         /* set n to utf8_distance(oldloc, locinput) */
5211                         while (locinput <= ST.maxpos) {
5212                             STRLEN len;
5213                             const UV c = utf8n_to_uvchr((U8*)locinput,
5214                                                   UTF8_MAXBYTES, &len,
5215                                                   uniflags);
5216                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5217                                 break;
5218                             locinput += len;
5219                             n++;
5220                         }
5221                     }
5222                 }
5223                 else {
                         /* byte target: scan forward for c1 (or c2); the
                          * distance is a simple pointer difference */
5224                     if (ST.c1 == ST.c2) {
5225                         while (locinput <= ST.maxpos &&
5226                                UCHARAT(locinput) != ST.c1)
5227                             locinput++;
5228                     }
5229                     else {
5230                         while (locinput <= ST.maxpos
5231                                && UCHARAT(locinput) != ST.c1
5232                                && UCHARAT(locinput) != ST.c2)
5233                             locinput++;
5234                     }
5235                     n = locinput - ST.oldloc;
5236                 }
                     /* ran past the furthest allowed position without seeing
                      * c1/c2: B cannot start anywhere in range */
5237                 if (locinput > ST.maxpos)
5238                     sayNO;
5239                 /* PL_reginput == oldloc now */
5240                 if (n) {
                         /* the skipped characters must all match A */
5241                     ST.count += n;
5242                     if (regrepeat(rex, ST.A, n, depth) < n)
5243                         sayNO;
5244                 }
5245                 PL_reginput = locinput;
5246                 CURLY_SETPAREN(ST.paren, ST.count);
5247                 if (cur_eval && cur_eval->u.eval.close_paren &&
5248                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5249                     goto fake_end;
5250                 }
5251                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5252             }
5253             /* NOTREACHED */
5254
5255
5256         case CURLY_B_min_fail:
5257             /* failed to find B in a non-greedy match where c1,c2 invalid */
                 /* With no first-character test available, B is simply retried
                  * after each additional A.  The case fails once A no longer
                  * matches or the count would go beyond ST.max. */
5258             if (ST.paren && ST.count)
5259                 PL_regoffs[ST.paren].end = -1;
5260
5261             REGCP_UNWIND(ST.cp);
5262             /* failed -- move forward one */
5263             PL_reginput = locinput;
5264             if (regrepeat(rex, ST.A, 1, depth)) {
5265                 ST.count++;
5266                 locinput = PL_reginput;
5267                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5268                         ST.count > 0)) /* count overflow ? */
5269                 {
                       /* also the entry point used right after the initial
                        * minimum has been consumed */
5270                   curly_try_B_min:
5271                     CURLY_SETPAREN(ST.paren, ST.count);
5272                     if (cur_eval && cur_eval->u.eval.close_paren &&
5273                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5274                         goto fake_end;
5275                     }
5276                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5277                 }
5278             }
5279             sayNO;
5280             /* NOTREACHED */
5281
5282
5283         curly_try_B_max:
5284             /* a successful greedy match: now try to match B */
5285             if (cur_eval && cur_eval->u.eval.close_paren &&
5286                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5287                 goto fake_end;
5288             }
5289             {
                     /* c is the character at the current position (decoded for
                      * a UTF-8 target, a single byte otherwise); it is only
                      * fetched when a c1/c2 test exists */
5290                 UV c = 0;
5291                 if (ST.c1 != CHRTEST_VOID)
5292                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5293                                            UTF8_MAXBYTES, 0, uniflags)
5294                                 : (UV) UCHARAT(PL_reginput);
5295                 /* If it could work, try it. */
5296                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5297                     CURLY_SETPAREN(ST.paren, ST.count);
5298                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5299                     /* NOTREACHED */
5300                 }
5301             }
                 /* the character cannot start B: handle it exactly like a
                  * failed attempt at B */
5302             /* FALL THROUGH */
5303         case CURLY_B_max_fail:
5304             /* failed to find B in a greedy match */
5305             if (ST.paren && ST.count)
5306                 PL_regoffs[ST.paren].end = -1;
5307
5308             REGCP_UNWIND(ST.cp);
5309             /*  back up. */
                 /* give back one A (one character, since A is width 1) unless
                  * that would drop below ST.min, then try B again */
5310             if (--ST.count < ST.min)
5311                 sayNO;
5312             PL_reginput = locinput = HOPc(locinput, -1);
5313             goto curly_try_B_max;
5314
5315 #undef ST
5316
5317         case END:
                 /* fake_end is also the target of the "close_paren" jumps in
                  * the quantifier cases.  With an eval frame active, reaching
                  * the end only finishes the inner pattern: state is switched
                  * back to the outer regex and matching continues with the
                  * frame's B.  Without one, this is the end of the match. */
5318             fake_end:
5319             if (cur_eval) {
5320                 /* we've just finished A in /(??{A})B/; now continue with B */
5321                 I32 tmpix;
                     /* re-apply the flag toggle recorded in the eval frame and
                      * keep a copy in this state */
5322                 st->u.eval.toggle_reg_flags
5323                             = cur_eval->u.eval.toggle_reg_flags;
5324                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5325
                     /* remember the inner regex in this state, then make the
                      * frame's prev_rex current again */
5326                 st->u.eval.prev_rex = rex_sv;           /* inner */
5327                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5328                 rex = (struct regexp *)SvANY(rex_sv);
5329                 rexi = RXi_GET(rex);
5330                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5331                 ReREFCNT_inc(rex_sv);
5332                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5333
5334                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5335                 PL_reglastparen = &rex->lastparen;
5336                 PL_reglastcloseparen = &rex->lastcloseparen;
5337
5338                 REGCP_SET(st->u.eval.lastcp);
5339                 PL_reginput = locinput;
5340
5341                 /* Restore parens of the outer rex without popping the
5342                  * savestack */
                     /* done by pointing PL_savestack_ix at the frame's lastcp
                      * for the duration of regcppop() and then putting the
                      * real index back */
5343                 tmpix = PL_savestack_ix;
5344                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5345                 regcppop(rex);
5346                 PL_savestack_ix = tmpix;
5347
                     /* unlink the finished frame: keep it reachable through
                      * this state's prev_eval and make its parent current */
5348                 st->u.eval.prev_eval = cur_eval;
5349                 cur_eval = cur_eval->u.eval.prev_eval;
5350                 DEBUG_EXECUTE_r(
5351                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5352                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5353                 if ( nochange_depth )
5354                     nochange_depth--;
5355
5356                 PUSH_YES_STATE_GOTO(EVAL_AB,
5357                         st->u.eval.prev_eval->u.eval.B); /* match B */
5358             }
5359
                 /* real end of pattern: a match ending before reginfo->till
                  * is rejected as too short */
5360             if (locinput < reginfo->till) {
5361                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5362                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5363                                       PL_colors[4],
5364                                       (long)(locinput - PL_reg_starttry),
5365                                       (long)(reginfo->till - PL_reg_starttry),
5366                                       PL_colors[5]));
5367
5368                 sayNO_SILENT;           /* Cannot match: too short. */
5369             }
5370             PL_reginput = locinput;     /* put where regtry can find it */
5371             sayYES;                     /* Success! */
5372
5373         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
                 /* End of a sub-pattern body: publish the position reached in
                  * PL_reginput and report success.  Unlike END there is no
                  * eval-frame handling and no minimum-length check. */
5374             DEBUG_EXECUTE_r(
5375             PerlIO_printf(Perl_debug_log,
5376                 "%*s  %ssubpattern success...%s\n",
5377                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5378             PL_reginput = locinput;     /* put where regtry can find it */
5379             sayYES;                     /* Success! */
5380
5381 #undef  ST
5382 #define ST st->u.ifmatch
5383
5384         case SUSPEND:   /* (?>A) */
                 /* SUSPEND, UNLESSM and IFMATCH share the code at do_ifmatch.
                  * ST.wanted records whether the body is expected to match
                  * (1) or to fail (0).  SUSPEND skips the lookbehind
                  * positioning done for the other two. */
5385             ST.wanted = 1;
5386             PL_reginput = locinput;
5387             goto do_ifmatch;
5388
5389         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5390             ST.wanted = 0;
5391             goto ifmatch_trivial_fail_test;
5392
5393         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5394             ST.wanted = 1;
5395           ifmatch_trivial_fail_test:
5396             if (scan->flags) {
                     /* lookbehind: start the body scan->flags positions before
                      * the current one; a NULL result from HOPBACKc() means the
                      * body cannot match at all */
5397                 char * const s = HOPBACKc(locinput, scan->flags);
5398                 if (!s) {
5399                     /* trivial fail */
                         /* the body counts as failed: either report that
                          * through sw (when `logical` is set), or fail the
                          * match if the body was required; otherwise skip to
                          * the node after the construct */
5400                     if (logical) {
5401                         logical = 0;
5402                         sw = 1 - cBOOL(ST.wanted);
5403                     }
5404                     else if (ST.wanted)
5405                         sayNO;
5406                     next = scan + ARG(scan);
5407                     if (next == scan)
5408                         next = NULL;
5409                     break;
5410                 }
5411                 PL_reginput = s;
5412             }
5413             else
5414                 PL_reginput = locinput;
5415
5416           do_ifmatch:
5417             ST.me = scan;
5418             ST.logical = logical;
5419             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5420
5421             /* execute body of (?...A) */
5422             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5423             /* NOTREACHED */
5424
5425         case IFMATCH_A_fail: /* body of (?...A) failed */
                 /* flip `wanted` so that, in the shared code below, a true
                  * value always means "the assertion holds" */
5426             ST.wanted = !ST.wanted;
5427             /* FALL THROUGH */
5428
5429         case IFMATCH_A: /* body of (?...A) succeeded */
5430             if (ST.logical) {
                     /* the outcome is reported through sw instead of failing
                      * the match */
5431                 sw = cBOOL(ST.wanted);
5432             }
5433             else if (!ST.wanted)
5434                 sayNO;
5435
                 /* SUSPEND keeps the position the body reached; the other
                  * node types go back to the position saved in st->locinput */
5436             if (OP(ST.me) == SUSPEND)
5437                 locinput = PL_reginput;
5438             else {
5439                 locinput = PL_reginput = st->locinput;
5440                 nextchr = UCHARAT(locinput);
5441             }
                 /* continue after the construct; an ARG offset of 0 leaves no
                  * node to continue with (scan = NULL) */
5442             scan = ST.me + ARG(ST.me);
5443             if (scan == ST.me)
5444                 scan = NULL;
5445             continue; /* execute B */
5446
5447 #undef ST
5448
5449         case LONGJMP:
                 /* jump ARG(scan) nodes ahead of this node; an offset of 0 is
                  * turned into "no next node" (NULL) */
5450             next = scan + ARG(scan);
5451             if (next == scan)
5452                 next = NULL;
5453             break;
5454         case COMMIT:
                 /* COMMIT is PRUNE plus moving the cut point to the end of the
                  * string */
5455             reginfo->cutpoint = PL_regeol;
5456             /* FALLTHROUGH */
5457         case PRUNE:
5458             PL_reginput = locinput;
                 /* with scan->flags clear, the SV stored in the regex data
                  * array at index ARG(scan) is recorded in both sv_yes_mark
                  * and sv_commit */
5459             if (!scan->flags)
5460                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5461             PUSH_STATE_GOTO(COMMIT_next,next);
5462             /* NOTREACHED */
5463         case COMMIT_next_fail:
                 /* what followed the verb failed: set no_final before failing
                  * (no_final is consumed outside this section) */
5464             no_final = 1;
5465             /* FALLTHROUGH */
5466         case OPFAIL:
5467             sayNO;
5468             /* NOTREACHED */
5469
5470 #define ST st->u.mark
5471         case MARKPOINT:
                 /* Active marks form a list linked through prev_mark, with
                  * mark_state pointing at the most recent one.  Link this
                  * state in, record its name (taken from the regex data array
                  * at index ARG(scan)) and its position, then continue. */
5472             ST.prev_mark = mark_state;
5473             ST.mark_name = sv_commit = sv_yes_mark
5474                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5475             mark_state = st;
5476             ST.mark_loc = PL_reginput = locinput;
5477             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5478             /* NOTREACHED */
5479         case MARKPOINT_next:
                 /* the rest matched: unlink this mark and report success */
5480             mark_state = ST.prev_mark;
5481             sayYES;
5482             /* NOTREACHED */
5483         case MARKPOINT_next_fail:
                 /* popmark is set by SKIP_next_fail to the mark name being
                  * searched for while failing back through the states */
5484             if (popmark && sv_eq(ST.mark_name,popmark))
5485             {
                     /* the cut point becomes one position before the mark,
                      * unless the mark is not past startpoint */
5486                 if (ST.mark_loc > startpoint)
5487                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5488                 popmark = NULL; /* we found our mark */
5489                 sv_commit = ST.mark_name;
5490
5491                 DEBUG_EXECUTE_r({
5492                         PerlIO_printf(Perl_debug_log,
5493                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5494                             REPORT_CODE_OFF+depth*2, "",
5495                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5496                 });
5497             }
                 /* unlink this mark; sv_yes_mark goes back to the name of the
                  * previous mark, or NULL when none remains */
5498             mark_state = ST.prev_mark;
5499             sv_yes_mark = mark_state ?
5500                 mark_state->u.mark.mark_name : NULL;
5501             sayNO;
5502             /* NOTREACHED */
5503         case SKIP:
5504             PL_reginput = locinput;
5505             if (scan->flags) {
5506                 /* (*SKIP) : if we fail we cut here*/
                     /* a NULL mark_name is how SKIP_next_fail recognises the
                      * unnamed form */
5507                 ST.mark_name = NULL;
5508                 ST.mark_loc = locinput;
5509                 PUSH_STATE_GOTO(SKIP_next,next);
5510             } else {
5511                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5512                    otherwise do nothing.  Meaning we need to scan
5513                  */
                     /* walk the list of active marks, newest first, looking
                      * for one whose name equals the SV stored for this node */
5514                 regmatch_state *cur = mark_state;
5515                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5516
5517                 while (cur) {
5518                     if ( sv_eq( cur->u.mark.mark_name,
5519                                 find ) )
5520                     {
                             /* found: push a state so that a later failure
                              * arrives at SKIP_next_fail with the name set
                              * (mark_loc is not set on this path) */
5521                         ST.mark_name = find;
5522                         PUSH_STATE_GOTO( SKIP_next, next );
5523                     }
5524                     cur = cur->u.mark.prev_mark;
5525                 }
5526             }
5527             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5528             break;
5529         case SKIP_next_fail:
5530             if (ST.mark_name) {
5531                 /* (*SKIP:NAME) - Set up to search for the name as we
5532                    collapse the stack*/
5533                 popmark = ST.mark_name;
5534             } else {
5535                 /* (*SKIP) - No name, we cut here.*/
                     /* the cut point becomes one position before where the
                      * verb was reached, unless that is not past startpoint */
5536                 if (ST.mark_loc > startpoint)
5537                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5538                 /* but we set sv_commit to latest mark_name if there
5539                    is one so they can test to see how things lead to this
5540                    cut */
5541                 if (mark_state)
5542                     sv_commit=mark_state->u.mark.mark_name;
5543             }
5544             no_final = 1;
5545             sayNO;
5546             /* NOTREACHED */
5547 #undef ST
5548         case FOLDCHAR:
                 /* n is the code point stored in the node.  Three ways to
                  * proceed:
                  *  1. what_len_TRICKYFOLD() reports n at the current
                  *     position: consume ln bytes (ln is set by that macro);
                  *  2. n is LATIN SMALL LETTER SHARP S and neither target nor
                  *     pattern is UTF-8: fail;
                  *  3. otherwise fold n with to_uni_fold() and compare the
                  *     result against the input with foldEQ_utf8(). */
5549             n = ARG(scan);
5550             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5551                 locinput += ln;
5552             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5553                 sayNO;
5554             } else  {
5555                 U8 folded[UTF8_MAXBYTES_CASE+1];
5556                 STRLEN foldlen;
5557                 const char * const l = locinput;
                     /* e starts at the end of the string; after a successful
                      * comparison it is used as the new input position */
5558                 char *e = PL_regeol;
5559                 to_uni_fold(n, folded, &foldlen);
5560
5561                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5562                                l, &e, 0,  utf8_target)) {
5563                         sayNO;
5564                 }
5565                 locinput = e;
5566             }
5567             nextchr = UCHARAT(locinput);
5568             break;
5569         case LNBREAK:
                 /* is_LNBREAK() yields the byte length of a line break at the
                  * current position, or 0 when there is none; consume that
                  * many bytes or fail */
5570             if ((n=is_LNBREAK(locinput,utf8_target))) {
5571                 locinput += n;
5572                 nextchr = UCHARAT(locinput);
5573             } else
5574                 sayNO;
5575             break;
5576
5577 #define CASE_CLASS(nAmE)                              \
5578         case nAmE:                                    \
5579             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5580                 locinput += n;                        \
5581                 nextchr = UCHARAT(locinput);          \
5582             } else                                    \
5583                 sayNO;                                \
5584             break;                                    \
5585         case N##nAmE:                                 \
5586             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5587                 sayNO;                                \
5588             } else {                                  \
5589                 locinput += UTF8SKIP(locinput);       \
5590                 nextchr = UCHARAT(locinput);          \
5591             }                                         \
5592             break
5593
5594         CASE_CLASS(VERTWS);
5595         CASE_CLASS(HORIZWS);
5596 #undef CASE_CLASS
5597
5598         default:
5599             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5600                           PTR2UV(scan), OP(scan));
5601             Perl_croak(aTHX_ "regexp memory corruption");
5602
5603         } /* end switch */
5604
5605         /* switch break jumps here */
5606         scan = next; /* prepare to execute the next op and ... */
5607         continue;    /* ... jump back to the top, reusing st */
5608         /* NOTREACHED */
5609
5610       push_yes_state:
5611         /* push a state that backtracks on success */
5612         st->u.yes.prev_yes_state = yes_state;
5613         yes_state = st;
5614         /* FALL THROUGH */
5615       push_state:
5616         /* push a new regex state, then continue at scan  */
5617         {
5618             regmatch_state *newst;
5619
5620             DEBUG_STACK_r({
5621                 regmatch_state *cur = st;
5622                 regmatch_state *curyes = yes_state;
5623                 int curd = depth;
5624                 regmatch_slab *slab = PL_regmatch_slab;
5625                 for (;curd > -1;cur--,curd--) {
5626                     if (cur < SLAB_FIRST(slab)) {
5627                         slab = slab->prev;
5628                         cur = SLAB_LAST(slab);
5629                     }
5630                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5631                         REPORT_CODE_OFF + 2 + depth * 2,"",
5632                         curd, PL_reg_name[cur->resume_state],
5633                         (curyes == cur) ? "yes" : ""
5634                     );
5635                     if (curyes == cur)
5636                         curyes = cur->u.yes.prev_yes_state;
5637                 }
5638             } else
5639                 DEBUG_STATE_pp("push")
5640             );
5641             depth++;
5642             st->locinput = locinput;
5643             newst = st+1;
5644             if (newst >  SLAB_LAST(PL_regmatch_slab))
5645                 newst = S_push_slab(aTHX);
5646             PL_regmatch_state = newst;
5647
5648             locinput = PL_reginput;
5649             nextchr = UCHARAT(locinput);
5650             st = newst;
5651             continue;
5652             /* NOTREACHED */
5653         }
5654     }
5655
5656     /*
5657     * We get here only if there's trouble -- normally "case END" is
5658     * the terminating point.
5659     */
5660     Perl_croak(aTHX_ "corrupted regexp pointers");
5661     /*NOTREACHED*/
5662     sayNO;
5663
5664 yes:
5665     if (yes_state) {
5666         /* we have successfully completed a subexpression, but we must now
5667          * pop to the state marked by yes_state and continue from there */
5668         assert(st != yes_state);
5669 #ifdef DEBUGGING
5670         while (st != yes_state) {
5671             st--;
5672             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5673                 PL_regmatch_slab = PL_regmatch_slab->prev;
5674                 st = SLAB_LAST(PL_regmatch_slab);
5675             }
5676             DEBUG_STATE_r({
5677                 if (no_final) {
5678                     DEBUG_STATE_pp("pop (no final)");
5679                 } else {
5680                     DEBUG_STATE_pp("pop (yes)");
5681                 }
5682             });
5683             depth--;
5684         }
5685 #else
5686         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5687             || yes_state > SLAB_LAST(PL_regmatch_slab))
5688         {
5689             /* not in this slab, pop slab */
5690             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5691             PL_regmatch_slab = PL_regmatch_slab->prev;
5692             st = SLAB_LAST(PL_regmatch_slab);
5693         }
5694         depth -= (st - yes_state);
5695 #endif
5696         st = yes_state;
5697         yes_state = st->u.yes.prev_yes_state;
5698         PL_regmatch_state = st;
5699
5700         if (no_final) {
5701             locinput= st->locinput;
5702             nextchr = UCHARAT(locinput);
5703         }
5704         state_num = st->resume_state + no_final;
5705         goto reenter_switch;
5706     }
5707
5708     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5709                           PL_colors[4], PL_colors[5]));
5710
5711     if (PL_reg_eval_set) {
5712         /* each successfully executed (?{...}) block does the equivalent of
5713          *   local $^R = do {...}
5714          * When popping the save stack, all these locals would be undone;
5715          * bypass this by setting the outermost saved $^R to the latest
5716          * value */
5717         if (oreplsv != GvSV(PL_replgv))
5718             sv_setsv(oreplsv, GvSV(PL_replgv));
5719     }
5720     result = 1;
5721     goto final_exit;
5722
5723 no:
5724     DEBUG_EXECUTE_r(
5725         PerlIO_printf(Perl_debug_log,
5726             "%*s  %sfailed...%s\n",
5727             REPORT_CODE_OFF+depth*2, "",
5728             PL_colors[4], PL_colors[5])
5729         );
5730
5731 no_silent:
5732     if (no_final) {
5733         if (yes_state) {
5734             goto yes;
5735         } else {
5736             goto final_exit;
5737         }
5738     }
5739     if (depth) {
5740         /* there's a previous state to backtrack to */
5741         st--;
5742         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5743             PL_regmatch_slab = PL_regmatch_slab->prev;
5744             st = SLAB_LAST(PL_regmatch_slab);
5745         }
5746         PL_regmatch_state = st;
5747         locinput= st->locinput;
5748         nextchr = UCHARAT(locinput);
5749
5750         DEBUG_STATE_pp("pop");
5751         depth--;
5752         if (yes_state == st)
5753             yes_state = st->u.yes.prev_yes_state;
5754
5755         state_num = st->resume_state + 1; /* failure = success + 1 */
5756         goto reenter_switch;
5757     }
5758     result = 0;
5759
5760   final_exit:
5761     if (rex->intflags & PREGf_VERBARG_SEEN) {
5762         SV *sv_err = get_sv("REGERROR", 1);
5763         SV *sv_mrk = get_sv("REGMARK", 1);
5764         if (result) {
5765             sv_commit = &PL_sv_no;
5766             if (!sv_yes_mark)
5767                 sv_yes_mark = &PL_sv_yes;
5768         } else {
5769             if (!sv_commit)
5770                 sv_commit = &PL_sv_yes;
5771             sv_yes_mark = &PL_sv_no;
5772         }
5773         sv_setsv(sv_err, sv_commit);
5774         sv_setsv(sv_mrk, sv_yes_mark);
5775     }
5776
5777     /* clean up; in particular, free all slabs above current one */
5778     LEAVE_SCOPE(oldsave);
5779
5780     return result;
5781 }
5782
5783 /*
5784  - regrepeat - from PL_reginput, match the simple node p as many times in
5785  -   a row as it will go (at most max), leave PL_reginput just past the
5786  -   last match and return the match count.  depth only indents the debug
5787  -   output.  Every match is assumed to be one character long, so the count
5788  -   is just scan - PL_reginput, except where a match may span several
5789  -   bytes (UTF-8 targets, LNBREAK): those loops count in hardcount instead.
5790  */
5791 STATIC I32
5792 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5793 {
5794     dVAR;
5795     register char *scan;
5796     register I32 c;
5797     register char *loceol = PL_regeol;
5798     register I32 hardcount = 0;
5799     register bool utf8_target = PL_reg_match_utf8;
5800 #ifndef DEBUGGING
5801     PERL_UNUSED_ARG(depth);
5802 #endif
5803
5804     PERL_ARGS_ASSERT_REGREPEAT;
5805
5806     scan = PL_reginput;
5807     if (max == REG_INFTY)
5808         max = I32_MAX;
5809     else if (max < loceol - scan)
5810         loceol = scan + max;        /* byte-wise loops stop after max bytes */
5811     switch (OP(p)) {
5812     case REG_ANY:
5813         if (utf8_target) {
5814             loceol = PL_regeol;     /* max counts characters here, not bytes */
5815             while (scan < loceol && hardcount < max && *scan != '\n') {
5816                 scan += UTF8SKIP(scan);
5817                 hardcount++;
5818             }
5819         } else {
5820             while (scan < loceol && *scan != '\n')
5821                 scan++;
5822         }
5823         break;
5824     case SANY:
5825         if (utf8_target) {
5826             loceol = PL_regeol;
5827             while (scan < loceol && hardcount < max) {
5828                 scan += UTF8SKIP(scan);
5829                 hardcount++;
5830             }
5831         }
5832         else
5833             scan = loceol;
5834         break;
5835     case CANY:
5836         scan = loceol;
5837         break;
5838     case EXACT:
5839         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5840          * means they match only characters in the string that can be expressed
5841          * as a single byte.  For non-utf8 strings, that means a simple match.
5842          * For utf8 strings, the character matched must be an invariant, or
5843          * downgradable to a single byte.  The pattern's utf8ness is
5844          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5845          * it is, it's an invariant */
5846
5847         c = (U8)*STRING(p);
5848         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5849
5850         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5851             while (scan < loceol && UCHARAT(scan) == c) {
5852                 scan++;
5853             }
5854         }
5855         else {
5856
5857             /* Here, the string is utf8, and the pattern char is different
5858              * in utf8 than not, so can't compare them directly.  Outside the
5859              * loop, find the two utf8 bytes that represent c, and then
5860              * look for those in sequence in the utf8 string */
5861             U8 high = UTF8_TWO_BYTE_HI(c);
5862             U8 low = UTF8_TWO_BYTE_LO(c);
5863             loceol = PL_regeol;
5864
5865             while (hardcount < max
5866                     && scan + 1 < loceol
5867                     && UCHARAT(scan) == high
5868                     && UCHARAT(scan + 1) == low)
5869             {
5870                 scan += 2;
5871                 hardcount++;
5872             }
5873         }
5874         break;
5875     case EXACTFL:
5876         PL_reg_flags |= RF_tainted;
5877         /* FALL THROUGH */
5878     case EXACTF:
5879     case EXACTFU:
5880
5881         /* The comments for the EXACT case above apply as well to these fold
5882          * ones */
5883
5884         c = (U8)*STRING(p);
5885         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5886
5887         if (utf8_target) { /* Use full Unicode fold matching */
5888
5889             /* For the EXACTFL case, It doesn't really make sense to compare
5890              * locale and utf8, but it is best we can do.  The documents warn
5891              * against mixing them */
5892
5893             char *tmpeol = loceol;
5894             while (hardcount < max
5895                     && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
5896                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN)))
5897             {
5898                 scan = tmpeol;
5899                 tmpeol = loceol;
5900                 hardcount++;
5901             }
5902
5903             /* XXX Note that the above handles properly the German sharp s in
5904              * the pattern matching ss in the string.  But it doesn't handle
5905              * properly cases where the string contains say 'LIGATURE ff' and
5906              * the pattern is 'f+'.  This would require, say, a new function or
5907              * revised interface to foldEQ_utf8(), in which the maximum number
5908              * of characters to match could be passed and it would return how
5909              * many actually did.  This is just one of many cases where
5910              * multi-char folds don't work properly, and so the fix is being
5911              * deferred */
5912         }
5913         else {
5914             U8 folded;
5915
5916             /* Here, the string isn't utf8 and c is a single byte; and either
5917              * the pattern isn't utf8 or c is an invariant, so its utf8ness
5918              * doesn't affect c.  Can just do simple comparisons for exact or
5919              * fold matching. */
5920             switch (OP(p)) {
5921                 case EXACTF: folded = PL_fold[c]; break;
5922                 case EXACTFU: folded = PL_fold_latin1[c]; break;
5923                 case EXACTFL: folded = PL_fold_locale[c]; break;
5924                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
5925             }
5926             while (scan < loceol &&
5927                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
5928             {
5929                 scan++;
5930             }
5931         }
5932         break;
5933     case ANYOF:
5934         if (utf8_target) {
5935             loceol = PL_regeol;
5936             while (hardcount < max && scan < loceol &&
5937                    reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
5938                 scan += UTF8SKIP(scan);
5939                 hardcount++;
5940             }
5941         } else {
5942             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5943                 scan++;
5944         }
5945         break;
5946     case ALNUM:
5947         if (utf8_target) {
5948             loceol = PL_regeol;
5949             LOAD_UTF8_CHARCLASS_ALNUM();
5950             while (hardcount < max && scan < loceol &&
5951                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5952             {
5953                 scan += UTF8SKIP(scan);
5954                 hardcount++;
5955             }
5956         } else if (FLAGS(p) & USE_UNI) {
5957             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
5958                 scan++;
5959             }
5960         } else {
5961             while (scan < loceol && isALNUM((U8) *scan)) {
5962                 scan++;
5963             }
5964         }
5965         break;
5966     case ALNUML:
5967         PL_reg_flags |= RF_tainted;
5968         if (utf8_target) {
5969             loceol = PL_regeol;
5970             while (hardcount < max && scan < loceol &&
5971                    isALNUM_LC_utf8((U8*)scan)) {
5972                 scan += UTF8SKIP(scan);
5973                 hardcount++;
5974             }
5975         } else {
5976             while (scan < loceol && isALNUM_LC(*scan))
5977                 scan++;
5978         }
5979         break;
5980     case NALNUM:
5981         if (utf8_target) {
5982             loceol = PL_regeol;
5983             LOAD_UTF8_CHARCLASS_ALNUM();
5984             while (hardcount < max && scan < loceol &&
5985                    !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5986             {
5987                 scan += UTF8SKIP(scan);
5988                 hardcount++;
5989             }
5990         } else if (FLAGS(p) & USE_UNI) {
5991             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
5992                 scan++;
5993             }
5994         } else {
5995             while (scan < loceol && ! isALNUM((U8) *scan)) {
5996                 scan++;
5997             }
5998         }
5999         break;
6000     case NALNUML:
6001         PL_reg_flags |= RF_tainted;
6002         if (utf8_target) {
6003             loceol = PL_regeol;
6004             while (hardcount < max && scan < loceol &&
6005                    !isALNUM_LC_utf8((U8*)scan)) {
6006                 scan += UTF8SKIP(scan);
6007                 hardcount++;
6008             }
6009         } else {
6010             while (scan < loceol && !isALNUM_LC(*scan))
6011                 scan++;
6012         }
6013         break;
6014     case SPACE:
6015         if (utf8_target) {
6016             loceol = PL_regeol;
6017             LOAD_UTF8_CHARCLASS_SPACE();
6018             while (hardcount < max && scan < loceol &&
6019                    (*scan == ' ' ||
6020                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6021             {
6022                 scan += UTF8SKIP(scan);
6023                 hardcount++;
6024             }
6025         } else if (FLAGS(p) & USE_UNI) {
6026             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6027                 scan++;
6028             }
6029         } else {
6030             while (scan < loceol && isSPACE((U8) *scan))
6031                 scan++;
6032         }
6033         break;
6034     case SPACEL:
6035         PL_reg_flags |= RF_tainted;
6036         if (utf8_target) {
6037             loceol = PL_regeol;
6038             while (hardcount < max && scan < loceol &&
6039                    isSPACE_LC_utf8((U8*)scan)) {
6040                 scan += UTF8SKIP(scan);
6041                 hardcount++;
6042             }
6043         } else {
6044             while (scan < loceol && isSPACE_LC(*scan))
6045                 scan++;
6046         }
6047         break;
6048     case NSPACE:
6049         if (utf8_target) {
6050             loceol = PL_regeol;
6051             LOAD_UTF8_CHARCLASS_SPACE();
6052             while (hardcount < max && scan < loceol &&
6053                    !(*scan == ' ' ||
6054                      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6055             {
6056                 scan += UTF8SKIP(scan);
6057                 hardcount++;
6058             }
6059         } else if (FLAGS(p) & USE_UNI) {
6060             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6061                 scan++;
6062             }
6063         } else {
6064             while (scan < loceol && ! isSPACE((U8) *scan)) {
6065                 scan++;
6066             }
6067         }
6068         break;
6069     case NSPACEL:
6070         PL_reg_flags |= RF_tainted;
6071         if (utf8_target) {
6072             loceol = PL_regeol;
6073             while (hardcount < max && scan < loceol &&
6074                    !isSPACE_LC_utf8((U8*)scan)) {
6075                 scan += UTF8SKIP(scan);
6076                 hardcount++;
6077             }
6078         } else {
6079             while (scan < loceol && !isSPACE_LC(*scan))
6080                 scan++;
6081         }
6082         break;
6083     case DIGIT:
6084         if (utf8_target) {
6085             loceol = PL_regeol;
6086             LOAD_UTF8_CHARCLASS_DIGIT();
6087             while (hardcount < max && scan < loceol &&
6088                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6089                 scan += UTF8SKIP(scan);
6090                 hardcount++;
6091             }
6092         } else {
6093             while (scan < loceol && isDIGIT(*scan))
6094                 scan++;
6095         }
6096         break;
6097     case DIGITL:
6098         PL_reg_flags |= RF_tainted;
6099         if (utf8_target) {
6100             loceol = PL_regeol;
6101             while (hardcount < max && scan < loceol &&
6102                    isDIGIT_LC_utf8((U8*)scan)) {
6103                 scan += UTF8SKIP(scan);
6104                 hardcount++;
6105             }
6106         } else {
6107             while (scan < loceol && isDIGIT_LC(*scan))
6108                 scan++;
6109         }
6110         break;
6111     case NDIGIT:
6112         if (utf8_target) {
6113             loceol = PL_regeol;
6114             LOAD_UTF8_CHARCLASS_DIGIT();
6115             while (hardcount < max && scan < loceol &&
6116                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6117                 scan += UTF8SKIP(scan);
6118                 hardcount++;
6119             }
6120         } else {
6121             while (scan < loceol && !isDIGIT(*scan))
6122                 scan++;
6123         }
6124         break;          /* keep out of NDIGITL: locale rules and tainting */
6125     case NDIGITL:
6126         PL_reg_flags |= RF_tainted;
6127         if (utf8_target) {
6128             loceol = PL_regeol;
6129             while (hardcount < max && scan < loceol &&
6130                    !isDIGIT_LC_utf8((U8*)scan)) {
6131                 scan += UTF8SKIP(scan);
6132                 hardcount++;
6133             }
6134         } else {
6135             while (scan < loceol && !isDIGIT_LC(*scan))
6136                 scan++;
6137         }
6138         break;
6139     case LNBREAK:
6140         if (utf8_target) {
6141             loceol = PL_regeol;
6142             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6143                 scan += c;
6144                 hardcount++;
6145             }
6146         } else {
6147             /*
6148               LNBREAK can match two latin chars, which is ok,
6149               because we have a null terminated string, but we
6150               have to use hardcount in this situation
6151             */
6152             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6153                 scan+=c;
6154                 hardcount++;
6155             }
6156         }
6157         break;
6158     case HORIZWS:
6159         if (utf8_target) {
6160             loceol = PL_regeol;
6161             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6162                 scan += c;
6163                 hardcount++;
6164             }
6165         } else {
6166             while (scan < loceol && is_HORIZWS_latin1(scan))
6167                 scan++;
6168         }
6169         break;
6170     case NHORIZWS:
6171         if (utf8_target) {
6172             loceol = PL_regeol;
6173             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6174                 scan += UTF8SKIP(scan);
6175                 hardcount++;
6176             }
6177         } else {
6178             while (scan < loceol && !is_HORIZWS_latin1(scan))
6179                 scan++;
6180         }
6181         break;
6182     case VERTWS:
6183         if (utf8_target) {
6184             loceol = PL_regeol;
6185             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6186                 scan += c;
6187                 hardcount++;
6188             }
6189         } else {
6190             while (scan < loceol && is_VERTWS_latin1(scan))
6191                 scan++;
6192
6193         }
6194         break;
6195     case NVERTWS:
6196         if (utf8_target) {
6197             loceol = PL_regeol;
6198             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6199                 scan += UTF8SKIP(scan);
6200                 hardcount++;
6201             }
6202         } else {
6203             while (scan < loceol && !is_VERTWS_latin1(scan))
6204                 scan++;
6205
6206         }
6207         break;
6208
6209     default:            /* Called on something of 0 width. */
6210         break;          /* So match right here or not at all. */
6211     }
6212
6213     if (hardcount)
6214         c = hardcount;              /* the loop counted the matches itself */
6215     else
6216         c = scan - PL_reginput;     /* one byte consumed per match */
6217     PL_reginput = scan;
6218
6219     DEBUG_r({
6220         GET_RE_DEBUG_FLAGS_DECL;
6221         DEBUG_EXECUTE_r({
6222             SV * const prop = sv_newmortal();
6223             regprop(prog, prop, p);
6224             PerlIO_printf(Perl_debug_log,
6225                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6226                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6227         });
6228     });
6229
6230     return(c);
6231 }
6232
6233
6234 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6235 /*
6236 - regclass_swash - return the swash kept in the regex data item that node's
6237 -   ARG selects, creating and storing it first when doinit is true.  Element
6238 -   0 of that item's array goes to *listsvp and element 2 (only if it is an
6239 -   AV) to *altsvp; either out-pointer may be NULL.
6240 */
6241 SV *
6242 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6243 {
6244     dVAR;
6245     SV *swash   = NULL;
6246     SV *list_sv = NULL;
6247     SV *alt_sv  = NULL;
6248     RXi_GET_DECL(prog,progi);
6249     const struct reg_data * const data = prog ? progi->data : NULL;
6250
6251     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6252
6253     /* Only a non-empty data table whose selected item is of kind 's' has
6254      * anything for us; otherwise every result stays NULL. */
6255     if (data && data->count && data->what[ARG(node)] == 's') {
6256         SV * const holder    = MUTABLE_SV(data->data[ARG(node)]);
6257         AV * const av        = MUTABLE_AV(SvRV(holder));
6258         SV ** const elems    = AvARRAY(av);
6259         /* See the end of regcomp.c:S_regclass() for
6260          * documentation of these array elements. */
6261         SV ** const sw_slot  = SvROK(elems[1]) ? elems + 1 : NULL;
6262         SV ** const alt_slot = SvTYPE(elems[2]) == SVt_PVAV ? elems + 2 : NULL;
6263
6264         list_sv = elems[0];
6265
6266         if (sw_slot != NULL) {
6267             swash = *sw_slot;               /* use the stored one */
6268         }
6269         else if (list_sv != NULL && doinit) {
6270             swash = swash_init("utf8", "", list_sv, 1, 0);
6271             (void)av_store(av, 1, swash);   /* keep it in element 1 */
6272         }
6273
6274         if (alt_slot != NULL)
6275             alt_sv = *alt_slot;
6276     }
6277
6278     if (listsvp != NULL)
6279         *listsvp = list_sv;
6280     if (altsvp != NULL)
6281         *altsvp = alt_sv;
6282
6283     return swash;
6284 }
6285 #endif
6286
6287 /*
6288  - reginclass - determine if a character falls into a character class
6289
6290   n is the ANYOF regnode
6291   p is the target string
6292   lenp is a pointer to the maximum number of bytes of how far to go in p
6293     (This is assumed without checking to always be at least the current
6294     character's size)
6295   utf8_target tells whether p is in UTF-8.
6296
6297   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6298   from a successful match, the value it points to will be updated to how many
6299   bytes in p were matched.  If there was no match, the value is undefined,
6300   possibly changed from the input.
6301
6302  */
6303
6304 STATIC bool
6305 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6306 {
6307     dVAR;
6308     const char flags = ANYOF_FLAGS(n);
6309     bool match = FALSE;
6310     UV c = *p;
6311     STRLEN c_len = 0;
6312     STRLEN maxlen;
6313
6314     PERL_ARGS_ASSERT_REGINCLASS;
6315
6316     /* If c is not already the code point, get it */
6317     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6318         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6319                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6320                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6321                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6322                  * UTF8_ALLOW_FFFF */
6323         if (c_len == (STRLEN)-1)
6324             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6325     }
6326     else {
6327         c_len = 1;
6328     }
6329
6330     /* Use passed in max length, or one character if none passed in or less
6331      * than one character.  And assume will match just one character.  This is
6332      * overwritten later if matched more. */
6333     if (lenp) {
6334         maxlen = (*lenp > c_len) ? *lenp : c_len;
6335         *lenp = c_len;
6336
6337     }
6338     else {
6339         maxlen = c_len;
6340     }
6341
6342     /* If this character is potentially in the bitmap, check it */
6343     if (c < 256) {
6344         if (ANYOF_BITMAP_TEST(n, c))
6345             match = TRUE;
6346         else if (flags & ANYOF_FOLD) {
6347             U8 f;
6348
6349             if (flags & ANYOF_LOCALE) {
6350                 PL_reg_flags |= RF_tainted;
6351                 f = PL_fold_locale[c];
6352             }
6353             else
6354                 f = PL_fold[c];
6355             if (f != c && ANYOF_BITMAP_TEST(n, f))
6356                 match = TRUE;
6357         }
6358
6359         if (!match && ANYOF_CLASS_TEST_ANY_SET(n)) {
6360             PL_reg_flags |= RF_tainted;     /* CLASS implies LOCALE */
6361             if (
6362                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6363                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6364                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6365                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6366                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6367                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6368                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6369                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6370                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6371                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6372                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6373                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6374                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6375                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6376                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6377                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6378                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6379                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6380                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6381                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6382                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6383                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6384                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6385                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6386                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6387                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6388                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6389                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6390                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6391                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6392                 ) /* How's that for a conditional? */
6393             {
6394                 match = TRUE;
6395             }
6396         }
6397     }
6398
6399     /* If the bitmap didn't (or couldn't) match, and something outside the
6400      * bitmap could match, try that */
6401     if (!match) {
6402         if (utf8_target && (flags & ANYOF_UNICODE_ALL)) {
6403             if (c >= 256
6404                 || ((flags & ANYOF_FOLD) /* Latin1 1 that has a non-Latin1 fold
6405                                             should match */
6406                     && _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c)))
6407             {
6408                 match = TRUE;
6409             }
6410         }
6411         if (!match && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6412                        || (utf8_target && flags & ANYOF_UTF8)))
6413         {
6414             AV *av;
6415             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6416
6417             if (sw) {
6418                 U8 * utf8_p;
6419                 if (utf8_target) {
6420                     utf8_p = (U8 *) p;
6421                 } else {
6422                     STRLEN len = 1;
6423                     utf8_p = bytes_to_utf8(p, &len);
6424                 }
6425                 if (swash_fetch(sw, utf8_p, 1))
6426                     match = TRUE;
6427                 else if (flags & ANYOF_FOLD) {
6428                     if (!match && lenp && av) {
6429                         I32 i;
6430                         for (i = 0; i <= av_len(av); i++) {
6431                             SV* const sv = *av_fetch(av, i, FALSE);
6432                             STRLEN len;
6433                             const char * const s = SvPV_const(sv, len);
6434                             if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) {
6435                                 *lenp = len;
6436                                 match = TRUE;
6437                                 break;
6438                             }
6439                         }
6440                     }
6441                     if (!match) { /* See if the folded version matches */
6442                         U8 folded[UTF8_MAXBYTES_CASE+1];
6443                         SV** listp;
6444                         STRLEN foldlen;
6445
6446                         to_utf8_fold(utf8_p, folded, &foldlen);
6447
6448                         /* Consider "k" =~ /[K]/i.  The line above would have
6449                          * just folded the 'k' to itself, and that isn't going
6450                          * to match 'K'.  So we look through the closure of
6451                          * everything that folds to 'k'.  That will find the
6452                          * 'K'.  Initialize the list, if necessary */
6453                         if (! PL_utf8_foldclosures) {
6454
6455                             /* If the folds haven't been read in, call a fold
6456                              * function to force that */
6457                             if (! PL_utf8_tofold) {
6458                                 U8 dummy[UTF8_MAXBYTES+1];
6459                                 STRLEN dummy_len;
6460                                 to_utf8_fold((U8*) "A", dummy, &dummy_len);
6461                             }
6462                             PL_utf8_foldclosures =
6463                                   _swash_inversion_hash(PL_utf8_tofold);
6464                         }
6465
6466                         /* The data structure is a hash with the keys every
6467                          * character that is folded to, like 'k', and the
6468                          * values each an array of everything that folds to its
6469                          * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
6470                         if ((listp = hv_fetch(PL_utf8_foldclosures,
6471                                       (char *) folded, foldlen, FALSE)))
6472                         {
6473                             AV* list = (AV*) *listp;
6474                             IV i;
6475                             for (i = 0; i <= av_len(list); i++) {
6476                                 SV** try_p = av_fetch(list, i, FALSE);
6477                                 char* try_c;
6478                                 if (try_p == NULL) {
6479                                     Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
6480                                 }
6481                                 /* Don't have to worry about embeded nulls
6482                                  * since NULL isn't folded or foldable */
6483                                 try_c = SvPVX(*try_p);
6484
6485                                 /* The fold in a few cases  of an above Latin1
6486                                  * char is in the Latin1 range, and hence may
6487                                  * be in the bitmap */
6488                                 if (UTF8_IS_INVARIANT(*try_c)
6489                                     && ANYOF_BITMAP_TEST(n,
6490                                                     UNI_TO_NATIVE(*try_c)))
6491                                 {
6492                                     match = TRUE;
6493                                     break;
6494                                 }
6495                                 else if
6496                                     (UTF8_IS_DOWNGRADEABLE_START(*try_c)
6497                                      && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(
6498                                                 TWO_BYTE_UTF8_TO_UNI(try_c[0],
6499                                                                     try_c[1]))))
6500                                 {
6501                                    /* Since the fold comes from internally
6502                                     * generated data, we can safely assume it
6503                                     * is valid utf8 in the test above */
6504                                     match = TRUE;
6505                                     break;
6506                                 } else if (swash_fetch(sw, (U8*) try_c, 1)) {
6507                                     match = TRUE;
6508                                     break;
6509                                 }
6510                             }
6511                         }
6512                     }
6513                 }
6514
6515                 /* If we allocated a string above, free it */
6516                 if (! utf8_target) Safefree(utf8_p);
6517             }
6518         }
6519     }
6520
6521     return (flags & ANYOF_INVERT) ? !match : match;
6522 }
6523
6524 STATIC U8 *
6525 S_reghop3(U8 *s, I32 off, const U8* lim)
6526 {
6527     dVAR;
6528
6529     PERL_ARGS_ASSERT_REGHOP3;
6530
6531     if (off >= 0) {
6532         while (off-- && s < lim) {
6533             /* XXX could check well-formedness here */
6534             s += UTF8SKIP(s);
6535         }
6536     }
6537     else {
6538         while (off++ && s > lim) {
6539             s--;
6540             if (UTF8_IS_CONTINUED(*s)) {
6541                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6542                     s--;
6543             }
6544             /* XXX could check well-formedness here */
6545         }
6546     }
6547     return s;
6548 }
6549
#ifdef XXX_dmq
/* There are a number of places that call reghop3 twice where this routine
   should be used instead.  That conversion has not been done yet, so the
   routine stays compiled out behind XXX_dmq - dmq
*/
/*
 * S_reghop4 - like a bounded hop, but with separate limits for each
 * direction.
 *
 *   s     starting position
 *   off   number of hops; >= 0 walks forward, < 0 walks backward
 *   llim  left boundary, consulted only when walking backward
 *   rlim  right boundary, consulted only when walking forward
 *
 * Always returns a pointer; reaching a boundary early is not reported.
 */
STATIC U8 *
S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
{
    dVAR;

    PERL_ARGS_ASSERT_REGHOP4;

    if (off < 0) {
        /* Backward: one iteration per requested hop, while above llim */
        for (; off != 0 && s > llim; off++) {
            --s;
            if (! UTF8_IS_CONTINUED(*s))
                continue;

            /* Retreat over bytes UTF8_IS_CONTINUATION accepts, but never
             * below llim.
             * XXX could check well-formedness here */
            while (s > llim && UTF8_IS_CONTINUATION(*s))
                --s;
        }
        return s;
    }

    /* Forward: test against rlim before each hop, then advance by
     * UTF8SKIP(s) bytes.
     * XXX could check well-formedness here */
    for (; off != 0 && s < rlim; off--) {
        s += UTF8SKIP(s);
    }
    return s;
}
#endif
6581
6582 STATIC U8 *
6583 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6584 {
6585     dVAR;
6586
6587     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6588
6589     if (off >= 0) {
6590         while (off-- && s < lim) {
6591             /* XXX could check well-formedness here */
6592             s += UTF8SKIP(s);
6593         }
6594         if (off >= 0)
6595             return NULL;
6596     }
6597     else {
6598         while (off++ && s > lim) {
6599             s--;
6600             if (UTF8_IS_CONTINUED(*s)) {
6601                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6602                     s--;
6603             }
6604             /* XXX could check well-formedness here */
6605         }
6606         if (off <= 0)
6607             return NULL;
6608     }
6609     return s;
6610 }
6611
6612 static void
6613 restore_pos(pTHX_ void *arg)
6614 {
6615     dVAR;
6616     regexp * const rex = (regexp *)arg;
6617     if (PL_reg_eval_set) {
6618         if (PL_reg_oldsaved) {
6619             rex->subbeg = PL_reg_oldsaved;
6620             rex->sublen = PL_reg_oldsavedlen;
6621 #ifdef PERL_OLD_COPY_ON_WRITE
6622             rex->saved_copy = PL_nrs;
6623 #endif
6624             RXp_MATCH_COPIED_on(rex);
6625         }
6626         PL_reg_magic->mg_len = PL_reg_oldpos;
6627         PL_reg_eval_set = 0;
6628         PL_curpm = PL_reg_oldcurpm;
6629     }
6630 }
6631
6632 STATIC void
6633 S_to_utf8_substr(pTHX_ register regexp *prog)
6634 {
6635     int i = 1;
6636
6637     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6638
6639     do {
6640         if (prog->substrs->data[i].substr
6641             && !prog->substrs->data[i].utf8_substr) {
6642             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6643             prog->substrs->data[i].utf8_substr = sv;
6644             sv_utf8_upgrade(sv);
6645             if (SvVALID(prog->substrs->data[i].substr)) {
6646                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6647                 if (flags & FBMcf_TAIL) {
6648                     /* Trim the trailing \n that fbm_compile added last
6649                        time.  */
6650                     SvCUR_set(sv, SvCUR(sv) - 1);
6651                     /* Whilst this makes the SV technically "invalid" (as its
6652                        buffer is no longer followed by "\0") when fbm_compile()
6653                        adds the "\n" back, a "\0" is restored.  */
6654                 }
6655                 fbm_compile(sv, flags);
6656             }
6657             if (prog->substrs->data[i].substr == prog->check_substr)
6658                 prog->check_utf8 = sv;
6659         }
6660     } while (i--);
6661 }
6662
6663 STATIC void
6664 S_to_byte_substr(pTHX_ register regexp *prog)
6665 {
6666     dVAR;
6667     int i = 1;
6668
6669     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6670
6671     do {
6672         if (prog->substrs->data[i].utf8_substr
6673             && !prog->substrs->data[i].substr) {
6674             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6675             if (sv_utf8_downgrade(sv, TRUE)) {
6676                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6677                     const U8 flags
6678                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6679                     if (flags & FBMcf_TAIL) {
6680                         /* Trim the trailing \n that fbm_compile added last
6681                            time.  */
6682                         SvCUR_set(sv, SvCUR(sv) - 1);
6683                     }
6684                     fbm_compile(sv, flags);
6685                 }
6686             } else {
6687                 SvREFCNT_dec(sv);
6688                 sv = &PL_sv_undef;
6689             }
6690             prog->substrs->data[i].substr = sv;
6691             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6692                 prog->check_substr = sv;
6693         }
6694     } while (i--);
6695 }
6696
6697 /*
6698  * Local variables:
6699  * c-indentation-style: bsd
6700  * c-basic-offset: 4
6701  * indent-tabs-mode: t
6702  * End:
6703  *
6704  * ex: set ts=8 sts=4 sw=4 noet:
6705  */