orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1               /* tainted information used? */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 /* Valid for non-utf8 strings only: avoids the reginclass call if there are no
  98  * complications: i.e., if everything matchable is straight forward in the
  99  * bitmap */
 100 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 101                                               : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111         (char *)(PL_reg_match_utf8 \
 112             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113             : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115         (char*)(PL_reg_match_utf8\
 116             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117             : (pos - off >= PL_bostr)           \
 118                 ? (U8*)pos - off                \
 119                 : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that is correct */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140             * assert should likely and hopefully fail on an EBCDIC machine */ \
 141         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                                                                             \
 143         /* No asserts are done for these, in case called on an early        \
 144             * Unicode version in which they map to nothing */               \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 /*
 154    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 155    so that it is possible to override the option here without having to
 156    rebuild the entire core. as we are required to do if we change regcomp.h
 157    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 158 */
 159 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 160 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 161 #endif
 162
 163 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 164 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 165 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 166 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 167 #define RE_utf8_perl_word   PL_utf8_alnum
 168 #define RE_utf8_perl_space  PL_utf8_space
 169 #define RE_utf8_posix_digit PL_utf8_digit
 170 #define perl_word  alnum
 171 #define perl_space space
 172 #define posix_digit digit
 173 #else
 174 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 175 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 176 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 177 #define RE_utf8_perl_word   PL_utf8_perl_word
 178 #define RE_utf8_perl_space  PL_utf8_perl_space
 179 #define RE_utf8_posix_digit PL_utf8_posix_digit
 180 #endif
 181
 182
 183 #define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
 184         case NAMEL:                                                         \
 185             PL_reg_flags |= RF_tainted;                                     \
 186             /* FALL THROUGH */                                              \
 187         case NAME:                                                          \
 188             if (!nextchr)                                                   \
 189                 sayNO;                                                      \
 190             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
 191                 if (!CAT2(PL_utf8_,CLASS)) {                                \
 192                     bool ok;                                                \
 193                     ENTER;                                                  \
 194                     save_re_context();                                      \
 195                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
 196                     assert(ok);                                             \
 197                     LEAVE;                                                  \
 198                 }                                                           \
 199                 if (!(OP(scan) == NAME                                      \
 200                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 201                     : LCFUNC_utf8((U8*)locinput)))                          \
 202                 {                                                           \
 203                     sayNO;                                                  \
 204                 }                                                           \
 205                 locinput += PL_utf8skip[nextchr];                           \
 206                 nextchr = UCHARAT(locinput);                                \
 207                 break;                                                      \
 208             }                                                               \
 209             /* Drops through to the macro that calls this one */
 210
 211 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
 212     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
 213             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
 214                 sayNO;                                                      \
 215             nextchr = UCHARAT(++locinput);                                  \
 216             break
 217
 218 /* Almost identical to the above, but has a case for a node that matches chars
 219  * between 128 and 255 using Unicode (latin1) semantics. */
 220 #define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 221     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 222             if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 223                 sayNO;                                                       \
 224             nextchr = UCHARAT(++locinput);                                   \
 225             break
 226
 227 #define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
 228         case NAMEL:                                                          \
 229             PL_reg_flags |= RF_tainted;                                      \
 230             /* FALL THROUGH */                                               \
 231         case NAME :                                                          \
 232             if (!nextchr && locinput >= PL_regeol)                           \
 233                 sayNO;                                                       \
 234             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
 235                 if (!CAT2(PL_utf8_,CLASS)) {                                 \
 236                     bool ok;                                                 \
 237                     ENTER;                                                   \
 238                     save_re_context();                                       \
 239                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
 240                     assert(ok);                                              \
 241                     LEAVE;                                                   \
 242                 }                                                            \
 243                 if ((OP(scan) == NAME                                        \
 244                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 245                     : LCFUNC_utf8((U8*)locinput)))                           \
 246                 {                                                            \
 247                     sayNO;                                                   \
 248                 }                                                            \
 249                 locinput += PL_utf8skip[nextchr];                            \
 250                 nextchr = UCHARAT(locinput);                                 \
 251                 break;                                                       \
 252             }
 253
 254 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
 255     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 256             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
 257                 sayNO;                                                       \
 258             nextchr = UCHARAT(++locinput);                                   \
 259             break
 260
 261
 262 #define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 263     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
 264             if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 265                 sayNO;                                                       \
 266             nextchr = UCHARAT(++locinput);                                   \
 267             break
 268
 269
 270
 271 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 272
 273 /* for use after a quantifier and before an EXACT-like node -- japhy */
 274 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 275  *
 276  * NOTE that *nothing* that affects backtracking should be in here, specifically
 277  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 278  * node that is in between two EXACT like nodes when ascertaining what the required
 279  * "follow" character is. This should probably be moved to regex compile time
 280  * although it may be done at run time beause of the REF possibility - more
 281  * investigation required. -- demerphq
 282 */
 283 #define JUMPABLE(rn) (      \
 284     OP(rn) == OPEN ||       \
 285     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 286     OP(rn) == EVAL ||   \
 287     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 288     OP(rn) == PLUS || OP(rn) == MINMOD || \
 289     OP(rn) == KEEPS || \
 290     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 291 )
 292 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 293
 294 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 295
 296 #if 0
 297 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 298    we don't need this definition. */
 299 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 300 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 301 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 302
 303 #else
 304 /* ... so we use this as its faster. */
 305 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 306 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 307 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 308
 309 #endif
 310
 311 /*
 312   Search for mandatory following text node; for lookahead, the text must
 313   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 314 */
 315 #define FIND_NEXT_IMPT(rn) STMT_START { \
 316     while (JUMPABLE(rn)) { \
 317         const OPCODE type = OP(rn); \
 318         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 319             rn = NEXTOPER(NEXTOPER(rn)); \
 320         else if (type == PLUS) \
 321             rn = NEXTOPER(rn); \
 322         else if (type == IFMATCH) \
 323             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 324         else rn += NEXT_OFF(rn); \
 325     } \
 326 } STMT_END
 327
 328
 329 static void restore_pos(pTHX_ void *arg);
 330
 331 #define REGCP_PAREN_ELEMS 4
 332 #define REGCP_OTHER_ELEMS 5
 333 #define REGCP_FRAME_ELEMS 1
 334 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 335  * are needed for the regexp context stack bookkeeping. */
 336
 337 STATIC CHECKPOINT
 338 S_regcppush(pTHX_ I32 parenfloor)
 339 {
 340     dVAR;
 341     const int retval = PL_savestack_ix;
 342     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 343     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 344     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 345     int p;
 346     GET_RE_DEBUG_FLAGS_DECL;
 347
 348     if (paren_elems_to_push < 0)
 349         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 350
 351     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 352         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 353                    " out of range (%lu-%ld)",
 354                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 355
 356     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 357
 358     for (p = PL_regsize; p > parenfloor; p--) {
 359 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 360         SSPUSHINT(PL_regoffs[p].end);
 361         SSPUSHINT(PL_regoffs[p].start);
 362         SSPUSHPTR(PL_reg_start_tmp[p]);
 363         SSPUSHINT(p);
 364         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 365           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 366                       (UV)p, (IV)PL_regoffs[p].start,
 367                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 368                       (IV)PL_regoffs[p].end
 369         ));
 370     }
 371 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 372     SSPUSHPTR(PL_regoffs);
 373     SSPUSHINT(PL_regsize);
 374     SSPUSHINT(*PL_reglastparen);
 375     SSPUSHINT(*PL_reglastcloseparen);
 376     SSPUSHPTR(PL_reginput);
 377     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 378
 379     return retval;
 380 }
 381
 382 /* These are needed since we do not localize EVAL nodes: */
 383 #define REGCP_SET(cp)                                           \
 384     DEBUG_STATE_r(                                              \
 385             PerlIO_printf(Perl_debug_log,                       \
 386                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 387                 (IV)PL_savestack_ix));                          \
 388     cp = PL_savestack_ix
 389
 390 #define REGCP_UNWIND(cp)                                        \
 391     DEBUG_STATE_r(                                              \
 392         if (cp != PL_savestack_ix)                              \
 393             PerlIO_printf(Perl_debug_log,                       \
 394                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 395                 (IV)(cp), (IV)PL_savestack_ix));                \
 396     regcpblow(cp)
 397
 398 STATIC char *
 399 S_regcppop(pTHX_ const regexp *rex)
 400 {
 401     dVAR;
 402     UV i;
 403     char *input;
 404     GET_RE_DEBUG_FLAGS_DECL;
 405
 406     PERL_ARGS_ASSERT_REGCPPOP;
 407
 408     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 409     i = SSPOPUV;
 410     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 411     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 412     input = (char *) SSPOPPTR;
 413     *PL_reglastcloseparen = SSPOPINT;
 414     *PL_reglastparen = SSPOPINT;
 415     PL_regsize = SSPOPINT;
 416     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 417
 418     i -= REGCP_OTHER_ELEMS;
 419     /* Now restore the parentheses context. */
 420     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 421         I32 tmps;
 422         U32 paren = (U32)SSPOPINT;
 423         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 424         PL_regoffs[paren].start = SSPOPINT;
 425         tmps = SSPOPINT;
 426         if (paren <= *PL_reglastparen)
 427             PL_regoffs[paren].end = tmps;
 428         DEBUG_BUFFERS_r(
 429             PerlIO_printf(Perl_debug_log,
 430                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 431                           (UV)paren, (IV)PL_regoffs[paren].start,
 432                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 433                           (IV)PL_regoffs[paren].end,
 434                           (paren > *PL_reglastparen ? "(no)" : ""));
 435         );
 436     }
 437     DEBUG_BUFFERS_r(
 438         if (*PL_reglastparen + 1 <= rex->nparens) {
 439             PerlIO_printf(Perl_debug_log,
 440                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 441                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 442         }
 443     );
 444 #if 1
 445     /* It would seem that the similar code in regtry()
 446      * already takes care of this, and in fact it is in
 447      * a better location to since this code can #if 0-ed out
 448      * but the code in regtry() is needed or otherwise tests
 449      * requiring null fields (pat.t#187 and split.t#{13,14}
 450      * (as of patchlevel 7877)  will fail.  Then again,
 451      * this code seems to be necessary or otherwise
 452      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 453      * --jhi updated by dapm */
 454     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 455         if (i > PL_regsize)
 456             PL_regoffs[i].start = -1;
 457         PL_regoffs[i].end = -1;
 458     }
 459 #endif
 460     return input;
 461 }
 462
 463 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 464
 465 /*
 466  * pregexec and friends
 467  */
 468
 469 #ifndef PERL_IN_XSUB_RE
 470 /*
 471  - pregexec - match a regexp against a string
 472  */
 473 I32
 474 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 475          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 476 /* strend: pointer to null at end of string */
 477 /* strbeg: real beginning of string */
 478 /* minend: end of match must be >=minend after stringarg. */
 479 /* nosave: For optimizations. */
 480 {
 481     PERL_ARGS_ASSERT_PREGEXEC;
 482
 483     return
 484         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 485                       nosave ? 0 : REXEC_COPY_STR);
 486 }
 487 #endif
 488
 489 /*
 490  * Need to implement the following flags for reg_anch:
 491  *
 492  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 493  * USE_INTUIT_ML
 494  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 495  * INTUIT_AUTORITATIVE_ML
 496  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 497  * INTUIT_ONCE_ML
 498  *
 499  * Another flag for this function: SECOND_TIME (so that float substrs
 500  * with giant delta may be not rechecked).
 501  */
 502
 503 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 504
 505 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 506    Otherwise, only SvCUR(sv) is used to get strbeg. */
 507
 508 /* XXXX We assume that strpos is strbeg unless sv. */
 509
 510 /* XXXX Some places assume that there is a fixed substring.
 511         An update may be needed if optimizer marks as "INTUITable"
 512         RExen without fixed substrings.  Similarly, it is assumed that
 513         lengths of all the strings are no more than minlen, thus they
 514         cannot come from lookahead.
 515         (Or minlen should take into account lookahead.)
 516   NOTE: Some of this comment is not correct. minlen does now take account
 517   of lookahead/behind. Further research is required. -- demerphq
 518
 519 */
 520
 521 /* A failure to find a constant substring means that there is no need to make
 522    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 523    finding a substring too deep into the string means that less calls to
 524    regtry() should be needed.
 525
 526    REx compiler's optimizer found 4 possible hints:
 527         a) Anchored substring;
 528         b) Fixed substring;
 529         c) Whether we are anchored (beginning-of-line or \G);
 530         d) First node (of those at offset 0) which may distingush positions;
 531    We use a)b)d) and multiline-part of c), and try to find a position in the
 532    string which does not contradict any of them.
 533  */
 534
 535 /* Most of decisions we do here should have been done at compile time.
 536    The nodes of the REx which we used for the search should have been
 537    deleted from the finite automaton. */
 538
 539 char *
 540 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 541                      char *strend, const U32 flags, re_scream_pos_data *data)
 542 {
 543     dVAR;
 544     struct regexp *const prog = (struct regexp *)SvANY(rx);
 545     register I32 start_shift = 0;
 546     /* Should be nonnegative! */
 547     register I32 end_shift   = 0;
 548     register char *s;
 549     register SV *check;
 550     char *strbeg;
 551     char *t;
 552     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 553     I32 ml_anch;
 554     register char *other_last = NULL;   /* other substr checked before this */
 555     char *check_at = NULL;              /* check substr found at this pos */
 556     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 557     RXi_GET_DECL(prog,progi);
 558 #ifdef DEBUGGING
 559     const char * const i_strpos = strpos;
 560 #endif
 561     GET_RE_DEBUG_FLAGS_DECL;
 562
 563     PERL_ARGS_ASSERT_RE_INTUIT_START;
 564
 565     RX_MATCH_UTF8_set(rx,utf8_target);
 566
 567     if (RX_UTF8(rx)) {
 568         PL_reg_flags |= RF_utf8;
 569     }
 570     DEBUG_EXECUTE_r(
 571         debug_start_match(rx, utf8_target, strpos, strend,
 572             sv ? "Guessing start of match in sv for"
 573                : "Guessing start of match in string for");
 574               );
 575
 576     /* CHR_DIST() would be more correct here but it makes things slow. */
 577     if (prog->minlen > strend - strpos) {
 578         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 579                               "String too short... [re_intuit_start]\n"));
 580         goto fail;
 581     }
 582
 583     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 584     PL_regeol = strend;
 585     if (utf8_target) {
 586         if (!prog->check_utf8 && prog->check_substr)
 587             to_utf8_substr(prog);
 588         check = prog->check_utf8;
 589     } else {
 590         if (!prog->check_substr && prog->check_utf8)
 591             to_byte_substr(prog);
 592         check = prog->check_substr;
 593     }
 594     if (check == &PL_sv_undef) {
 595         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 596                 "Non-utf8 string cannot match utf8 check string\n"));
 597         goto fail;
 598     }
 599     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 600         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 601                      || ( (prog->extflags & RXf_ANCH_BOL)
 602                           && !multiline ) );    /* Check after \n? */
 603
 604         if (!ml_anch) {
 605           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 606                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 607                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 608                && sv && !SvROK(sv)
 609                && (strpos != strbeg)) {
 610               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 611               goto fail;
 612           }
 613           if (prog->check_offset_min == prog->check_offset_max &&
 614               !(prog->extflags & RXf_CANY_SEEN)) {
 615             /* Substring at constant offset from beg-of-str... */
 616             I32 slen;
 617
 618             s = HOP3c(strpos, prog->check_offset_min, strend);
 619
 620             if (SvTAIL(check)) {
 621                 slen = SvCUR(check);    /* >= 1 */
 622
 623                 if ( strend - s > slen || strend - s < slen - 1
 624                      || (strend - s == slen && strend[-1] != '\n')) {
 625                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 626                     goto fail_finish;
 627                 }
 628                 /* Now should match s[0..slen-2] */
 629                 slen--;
 630                 if (slen && (*SvPVX_const(check) != *s
 631                              || (slen > 1
 632                                  && memNE(SvPVX_const(check), s, slen)))) {
 633                   report_neq:
 634                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 635                     goto fail_finish;
 636                 }
 637             }
 638             else if (*SvPVX_const(check) != *s
 639                      || ((slen = SvCUR(check)) > 1
 640                          && memNE(SvPVX_const(check), s, slen)))
 641                 goto report_neq;
 642             check_at = s;
 643             goto success_at_start;
 644           }
 645         }
 646         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 647         s = strpos;
 648         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 649         end_shift = prog->check_end_shift;
 650
 651         if (!ml_anch) {
 652             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 653                                          - (SvTAIL(check) != 0);
 654             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 655
 656             if (end_shift < eshift)
 657                 end_shift = eshift;
 658         }
 659     }
 660     else {                              /* Can match at random position */
 661         ml_anch = 0;
 662         s = strpos;
 663         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 664         end_shift = prog->check_end_shift;
 665
 666         /* end shift should be non negative here */
 667     }
 668
 669 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 670     if (end_shift < 0)
 671         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 672                    (IV)end_shift, RX_PRECOMP(prog));
 673 #endif
 674
 675   restart:
 676     /* Find a possible match in the region s..strend by looking for
 677        the "check" substring in the region corrected by start/end_shift. */
 678
 679     {
 680         I32 srch_start_shift = start_shift;
 681         I32 srch_end_shift = end_shift;
 682         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 683             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 684             srch_start_shift = strbeg - s;
 685         }
 686     DEBUG_OPTIMISE_MORE_r({
 687         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 688             (IV)prog->check_offset_min,
 689             (IV)srch_start_shift,
 690             (IV)srch_end_shift,
 691             (IV)prog->check_end_shift);
 692     });
 693
 694     if (flags & REXEC_SCREAM) {
 695         I32 p = -1;                     /* Internal iterator of scream. */
 696         I32 * const pp = data ? data->scream_pos : &p;
 697
 698         if (PL_screamfirst[BmRARE(check)] >= 0
 699             || ( BmRARE(check) == '\n'
 700                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 701                  && SvTAIL(check) ))
 702             s = screaminstr(sv, check,
 703                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 704         else
 705             goto fail_finish;
 706         /* we may be pointing at the wrong string */
 707         if (s && RXp_MATCH_COPIED(prog))
 708             s = strbeg + (s - SvPVX_const(sv));
 709         if (data)
 710             *data->scream_olds = s;
 711     }
 712     else {
 713         U8* start_point;
 714         U8* end_point;
 715         if (prog->extflags & RXf_CANY_SEEN) {
 716             start_point= (U8*)(s + srch_start_shift);
 717             end_point= (U8*)(strend - srch_end_shift);
 718         } else {
 719             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 720             end_point= HOP3(strend, -srch_end_shift, strbeg);
 721         }
 722         DEBUG_OPTIMISE_MORE_r({
 723             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 724                 (int)(end_point - start_point),
 725                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 726                 start_point);
 727         });
 728
 729         s = fbm_instr( start_point, end_point,
 730                       check, multiline ? FBMrf_MULTILINE : 0);
 731     }
 732     }
 733     /* Update the count-of-usability, remove useless subpatterns,
 734         unshift s.  */
 735
 736     DEBUG_EXECUTE_r({
 737         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 738             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 739         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 740                           (s ? "Found" : "Did not find"),
 741             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 742                 ? "anchored" : "floating"),
 743             quoted,
 744             RE_SV_TAIL(check),
 745             (s ? " at offset " : "...\n") );
 746     });
 747
 748     if (!s)
 749         goto fail_finish;
 750     /* Finish the diagnostic message */
 751     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 752
 753     /* XXX dmq: first branch is for positive lookbehind...
 754        Our check string is offset from the beginning of the pattern.
 755        So we need to do any stclass tests offset forward from that
 756        point. I think. :-(
 757      */
 758
 759
 760
 761     check_at=s;
 762
 763
 764     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 765        Start with the other substr.
 766        XXXX no SCREAM optimization yet - and a very coarse implementation
 767        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 768                 *always* match.  Probably should be marked during compile...
 769        Probably it is right to do no SCREAM here...
 770      */
 771
 772     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 773                 : (prog->float_substr && prog->anchored_substr))
 774     {
 775         /* Take into account the "other" substring. */
 776         /* XXXX May be hopelessly wrong for UTF... */
 777         if (!other_last)
 778             other_last = strpos;
 779         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 780           do_other_anchored:
 781             {
 782                 char * const last = HOP3c(s, -start_shift, strbeg);
 783                 char *last1, *last2;
 784                 char * const saved_s = s;
 785                 SV* must;
 786
 787                 t = s - prog->check_offset_max;
 788                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 789                     && (!utf8_target
 790                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 791                             && t > strpos)))
 792                     NOOP;
 793                 else
 794                     t = strpos;
 795                 t = HOP3c(t, prog->anchored_offset, strend);
 796                 if (t < other_last)     /* These positions already checked */
 797                     t = other_last;
 798                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 799                 if (last < last1)
 800                     last1 = last;
 801                 /* XXXX It is not documented what units *_offsets are in.
 802                    We assume bytes, but this is clearly wrong.
 803                    Meaning this code needs to be carefully reviewed for errors.
 804                    dmq.
 805                   */
 806
 807                 /* On end-of-str: see comment below. */
 808                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 809                 if (must == &PL_sv_undef) {
 810                     s = (char*)NULL;
 811                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 812                 }
 813                 else
 814                     s = fbm_instr(
 815                         (unsigned char*)t,
 816                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 817                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 818                         must,
 819                         multiline ? FBMrf_MULTILINE : 0
 820                     );
 821                 DEBUG_EXECUTE_r({
 822                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 823                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 824                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 825                         (s ? "Found" : "Contradicts"),
 826                         quoted, RE_SV_TAIL(must));
 827                 });
 828
 829
 830                 if (!s) {
 831                     if (last1 >= last2) {
 832                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 833                                                 ", giving up...\n"));
 834                         goto fail_finish;
 835                     }
 836                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 837                         ", trying floating at offset %ld...\n",
 838                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 839                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 840                     s = HOP3c(last, 1, strend);
 841                     goto restart;
 842                 }
 843                 else {
 844                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 845                           (long)(s - i_strpos)));
 846                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 847                     other_last = HOP3c(s, 1, strend);
 848                     s = saved_s;
 849                     if (t == strpos)
 850                         goto try_at_start;
 851                     goto try_at_offset;
 852                 }
 853             }
 854         }
 855         else {          /* Take into account the floating substring. */
 856             char *last, *last1;
 857             char * const saved_s = s;
 858             SV* must;
 859
 860             t = HOP3c(s, -start_shift, strbeg);
 861             last1 = last =
 862                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 863             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 864                 last = HOP3c(t, prog->float_max_offset, strend);
 865             s = HOP3c(t, prog->float_min_offset, strend);
 866             if (s < other_last)
 867                 s = other_last;
 868  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 869             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 870             /* fbm_instr() takes into account exact value of end-of-str
 871                if the check is SvTAIL(ed).  Since false positives are OK,
 872                and end-of-str is not later than strend we are OK. */
 873             if (must == &PL_sv_undef) {
 874                 s = (char*)NULL;
 875                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 876             }
 877             else
 878                 s = fbm_instr((unsigned char*)s,
 879                               (unsigned char*)last + SvCUR(must)
 880                                   - (SvTAIL(must)!=0),
 881                               must, multiline ? FBMrf_MULTILINE : 0);
 882             DEBUG_EXECUTE_r({
 883                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 884                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 885                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 886                     (s ? "Found" : "Contradicts"),
 887                     quoted, RE_SV_TAIL(must));
 888             });
 889             if (!s) {
 890                 if (last1 == last) {
 891                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 892                                             ", giving up...\n"));
 893                     goto fail_finish;
 894                 }
 895                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 896                     ", trying anchored starting at offset %ld...\n",
 897                     (long)(saved_s + 1 - i_strpos)));
 898                 other_last = last;
 899                 s = HOP3c(t, 1, strend);
 900                 goto restart;
 901             }
 902             else {
 903                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 904                       (long)(s - i_strpos)));
 905                 other_last = s; /* Fix this later. --Hugo */
 906                 s = saved_s;
 907                 if (t == strpos)
 908                     goto try_at_start;
 909                 goto try_at_offset;
 910             }
 911         }
 912     }
 913
 914
 915     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 916
 917     DEBUG_OPTIMISE_MORE_r(
 918         PerlIO_printf(Perl_debug_log,
 919             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 920             (IV)prog->check_offset_min,
 921             (IV)prog->check_offset_max,
 922             (IV)(s-strpos),
 923             (IV)(t-strpos),
 924             (IV)(t-s),
 925             (IV)(strend-strpos)
 926         )
 927     );
 928
 929     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 930         && (!utf8_target
 931             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 932                  && t > strpos)))
 933     {
 934         /* Fixed substring is found far enough so that the match
 935            cannot start at strpos. */
 936       try_at_offset:
 937         if (ml_anch && t[-1] != '\n') {
 938             /* Eventually fbm_*() should handle this, but often
 939                anchored_offset is not 0, so this check will not be wasted. */
 940             /* XXXX In the code below we prefer to look for "^" even in
 941                presence of anchored substrings.  And we search even
 942                beyond the found float position.  These pessimizations
 943                are historical artefacts only.  */
 944           find_anchor:
 945             while (t < strend - prog->minlen) {
 946                 if (*t == '\n') {
 947                     if (t < check_at - prog->check_offset_min) {
 948                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 949                             /* Since we moved from the found position,
 950                                we definitely contradict the found anchored
 951                                substr.  Due to the above check we do not
 952                                contradict "check" substr.
 953                                Thus we can arrive here only if check substr
 954                                is float.  Redo checking for "other"=="fixed".
 955                              */
 956                             strpos = t + 1;
 957                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 958                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 959                             goto do_other_anchored;
 960                         }
 961                         /* We don't contradict the found floating substring. */
 962                         /* XXXX Why not check for STCLASS? */
 963                         s = t + 1;
 964                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 965                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 966                         goto set_useful;
 967                     }
 968                     /* Position contradicts check-string */
 969                     /* XXXX probably better to look for check-string
 970                        than for "\n", so one should lower the limit for t? */
 971                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 972                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 973                     other_last = strpos = s = t + 1;
 974                     goto restart;
 975                 }
 976                 t++;
 977             }
 978             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 979                         PL_colors[0], PL_colors[1]));
 980             goto fail_finish;
 981         }
 982         else {
 983             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 984                         PL_colors[0], PL_colors[1]));
 985         }
 986         s = t;
 987       set_useful:
 988         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 989     }
 990     else {
 991         /* The found string does not prohibit matching at strpos,
 992            - no optimization of calling REx engine can be performed,
 993            unless it was an MBOL and we are not after MBOL,
 994            or a future STCLASS check will fail this. */
 995       try_at_start:
 996         /* Even in this situation we may use MBOL flag if strpos is offset
 997            wrt the start of the string. */
 998         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 999             && (strpos != strbeg) && strpos[-1] != '\n'
1000             /* May be due to an implicit anchor of m{.*foo}  */
1001             && !(prog->intflags & PREGf_IMPLICIT))
1002         {
1003             t = strpos;
1004             goto find_anchor;
1005         }
1006         DEBUG_EXECUTE_r( if (ml_anch)
1007             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1008                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1009         );
1010       success_at_start:
1011         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1012             && (utf8_target ? (
1013                 prog->check_utf8                /* Could be deleted already */
1014                 && --BmUSEFUL(prog->check_utf8) < 0
1015                 && (prog->check_utf8 == prog->float_utf8)
1016             ) : (
1017                 prog->check_substr              /* Could be deleted already */
1018                 && --BmUSEFUL(prog->check_substr) < 0
1019                 && (prog->check_substr == prog->float_substr)
1020             )))
1021         {
1022             /* If flags & SOMETHING - do not do it many times on the same match */
1023             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1024             /* XXX Does the destruction order has to change with utf8_target? */
1025             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1026             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1027             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1028             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1029             check = NULL;                       /* abort */
1030             s = strpos;
1031             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag
1032                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1033             if (prog->intflags & PREGf_IMPLICIT)
1034                 prog->extflags &= ~RXf_ANCH_MBOL;
1035             /* XXXX This is a remnant of the old implementation.  It
1036                     looks wasteful, since now INTUIT can use many
1037                     other heuristics. */
1038             prog->extflags &= ~RXf_USE_INTUIT;
1039             /* XXXX What other flags might need to be cleared in this branch? */
1040         }
1041         else
1042             s = strpos;
1043     }
1044
1045     /* Last resort... */
1046     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1047     /* trie stclasses are too expensive to use here, we are better off to
1048        leave it to regmatch itself */
1049     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1050         /* minlen == 0 is possible if regstclass is \b or \B,
1051            and the fixed substr is ''$.
1052            Since minlen is already taken into account, s+1 is before strend;
1053            accidentally, minlen >= 1 guaranties no false positives at s + 1
1054            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1055            regstclass does not come from lookahead...  */
1056         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1057            This leaves EXACTF only, which is dealt with in find_byclass().  */
1058         const U8* const str = (U8*)STRING(progi->regstclass);
1059         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1060                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1061                     : 1);
1062         char * endpos;
1063         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1064             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1065         else if (prog->float_substr || prog->float_utf8)
1066             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1067         else
1068             endpos= strend;
1069
1070         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1071                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1072
1073         t = s;
1074         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1075         if (!s) {
1076 #ifdef DEBUGGING
1077             const char *what = NULL;
1078 #endif
1079             if (endpos == strend) {
1080                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1081                                 "Could not match STCLASS...\n") );
1082                 goto fail;
1083             }
1084             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1085                                    "This position contradicts STCLASS...\n") );
1086             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1087                 goto fail;
1088             /* Contradict one of substrings */
1089             if (prog->anchored_substr || prog->anchored_utf8) {
1090                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1091                     DEBUG_EXECUTE_r( what = "anchored" );
1092                   hop_and_restart:
1093                     s = HOP3c(t, 1, strend);
1094                     if (s + start_shift + end_shift > strend) {
1095                         /* XXXX Should be taken into account earlier? */
1096                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1097                                                "Could not match STCLASS...\n") );
1098                         goto fail;
1099                     }
1100                     if (!check)
1101                         goto giveup;
1102                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1103                                 "Looking for %s substr starting at offset %ld...\n",
1104                                  what, (long)(s + start_shift - i_strpos)) );
1105                     goto restart;
1106                 }
1107                 /* Have both, check_string is floating */
1108                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1109                     goto retry_floating_check;
1110                 /* Recheck anchored substring, but not floating... */
1111                 s = check_at;
1112                 if (!check)
1113                     goto giveup;
1114                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1115                           "Looking for anchored substr starting at offset %ld...\n",
1116                           (long)(other_last - i_strpos)) );
1117                 goto do_other_anchored;
1118             }
1119             /* Another way we could have checked stclass at the
1120                current position only: */
1121             if (ml_anch) {
1122                 s = t = t + 1;
1123                 if (!check)
1124                     goto giveup;
1125                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1126                           "Looking for /%s^%s/m starting at offset %ld...\n",
1127                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1128                 goto try_at_offset;
1129             }
1130             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1131                 goto fail;
1132             /* Check is floating subtring. */
1133           retry_floating_check:
1134             t = check_at - start_shift;
1135             DEBUG_EXECUTE_r( what = "floating" );
1136             goto hop_and_restart;
1137         }
1138         if (t != s) {
1139             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1140                         "By STCLASS: moving %ld --> %ld\n",
1141                                   (long)(t - i_strpos), (long)(s - i_strpos))
1142                    );
1143         }
1144         else {
1145             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1146                                   "Does not contradict STCLASS...\n");
1147                    );
1148         }
1149     }
1150   giveup:
1151     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1152                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1153                           PL_colors[5], (long)(s - i_strpos)) );
1154     return s;
1155
1156   fail_finish:                          /* Substring not found */
1157     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1158         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1159   fail:
1160     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1161                           PL_colors[4], PL_colors[5]));
1162     return NULL;
1163 }
1164
1165 #define DECL_TRIE_TYPE(scan) \
1166     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1167                     trie_type = (scan->flags != EXACT) \
1168                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1169                               : (utf8_target ? trie_utf8 : trie_plain)
1170
1171 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1172 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1173     switch (trie_type) {                                                    \
1174     case trie_utf8_fold:                                                    \
1175         if ( foldlen>0 ) {                                                  \
1176             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1177             foldlen -= len;                                                 \
1178             uscan += len;                                                   \
1179             len=0;                                                          \
1180         } else {                                                            \
1181             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1182             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1183             foldlen -= UNISKIP( uvc );                                      \
1184             uscan = foldbuf + UNISKIP( uvc );                               \
1185         }                                                                   \
1186         break;                                                              \
1187     case trie_latin_utf8_fold:                                              \
1188         if ( foldlen>0 ) {                                                  \
1189             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1190             foldlen -= len;                                                 \
1191             uscan += len;                                                   \
1192             len=0;                                                          \
1193         } else {                                                            \
1194             len = 1;                                                        \
1195             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1196             foldlen -= UNISKIP( uvc );                                      \
1197             uscan = foldbuf + UNISKIP( uvc );                               \
1198         }                                                                   \
1199         break;                                                              \
1200     case trie_utf8:                                                         \
1201         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1202         break;                                                              \
1203     case trie_plain:                                                        \
1204         uvc = (UV)*uc;                                                      \
1205         len = 1;                                                            \
1206     }                                                                       \
1207     if (uvc < 256) {                                                        \
1208         charid = trie->charmap[ uvc ];                                      \
1209     }                                                                       \
1210     else {                                                                  \
1211         charid = 0;                                                         \
1212         if (widecharmap) {                                                  \
1213             SV** const svpp = hv_fetch(widecharmap,                         \
1214                         (char*)&uvc, sizeof(UV), 0);                        \
1215             if (svpp)                                                       \
1216                 charid = (U16)SvIV(*svpp);                                  \
1217         }                                                                   \
1218     }                                                                       \
1219 } STMT_END
1220
1221 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1222 {                                                      \
1223     char *my_strend= (char *)strend;                   \
1224     if ( (CoNd)                                        \
1225          && (ln == len ||                              \
1226              foldEQ_utf8(s, &my_strend, 0,  utf8_target,   \
1227                         m, NULL, ln, cBOOL(UTF_PATTERN)))      \
1228          && (!reginfo || regtry(reginfo, &s)) )        \
1229         goto got_it;                                   \
1230     else {                                             \
1231          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1232          uvchr_to_utf8(tmpbuf, c);                     \
1233          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1234          if ( f != c                                   \
1235               && (f == c1 || f == c2)                  \
1236               && (ln == len ||                         \
1237                 foldEQ_utf8(s, &my_strend, 0,  utf8_target,\
1238                               m, NULL, ln, cBOOL(UTF_PATTERN)))\
1239               && (!reginfo || regtry(reginfo, &s)) )   \
1240               goto got_it;                             \
1241     }                                                  \
1242 }                                                      \
1243 s += len
1244
1245 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1246 STMT_START {                                              \
1247     while (s <= e) {                                      \
1248         if ( (CoNd)                                       \
1249              && (ln == 1 || (OP(c) == EXACTF             \
1250                               ? foldEQ(s, m, ln)           \
1251                               : foldEQ_locale(s, m, ln)))  \
1252              && (!reginfo || regtry(reginfo, &s)) )        \
1253             goto got_it;                                  \
1254         s++;                                              \
1255     }                                                     \
1256 } STMT_END
1257
1258 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1259 STMT_START {                                          \
1260     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1261         CoDe                                          \
1262         s += uskip;                                   \
1263     }                                                 \
1264 } STMT_END
1265
1266 #define REXEC_FBC_SCAN(CoDe)                          \
1267 STMT_START {                                          \
1268     while (s < strend) {                              \
1269         CoDe                                          \
1270         s++;                                          \
1271     }                                                 \
1272 } STMT_END
1273
1274 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1275 REXEC_FBC_UTF8_SCAN(                                  \
1276     if (CoNd) {                                       \
1277         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1278             goto got_it;                              \
1279         else                                          \
1280             tmp = doevery;                            \
1281     }                                                 \
1282     else                                              \
1283         tmp = 1;                                      \
1284 )
1285
1286 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1287 REXEC_FBC_SCAN(                                       \
1288     if (CoNd) {                                       \
1289         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1290             goto got_it;                              \
1291         else                                          \
1292             tmp = doevery;                            \
1293     }                                                 \
1294     else                                              \
1295         tmp = 1;                                      \
1296 )
1297
1298 #define REXEC_FBC_TRYIT               \
1299 if ((!reginfo || regtry(reginfo, &s))) \
1300     goto got_it
1301
1302 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1303     if (utf8_target) {                                             \
1304         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1305     }                                                          \
1306     else {                                                     \
1307         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1308     }                                                          \
1309     break
1310
1311 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1312     if (utf8_target) {                                             \
1313         UtFpReLoAd;                                            \
1314         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1315     }                                                          \
1316     else {                                                     \
1317         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1318     }                                                          \
1319     break
1320
1321 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1322     PL_reg_flags |= RF_tainted;                                \
1323     if (utf8_target) {                                             \
1324         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1325     }                                                          \
1326     else {                                                     \
1327         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1328     }                                                          \
1329     break
1330
1331 #define DUMP_EXEC_POS(li,s,doutf8) \
1332     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1333
1334 /* We know what class REx starts with.  Try to find this position... */
1335 /* if reginfo is NULL, its a dryrun */
1336 /* annoyingly all the vars in this routine have different names from their counterparts
1337    in regmatch. /grrr */
1338
1339 STATIC char *
1340 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1341     const char *strend, regmatch_info *reginfo)
1342 {
1343         dVAR;
1344         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1345         char *m;
1346         STRLEN ln;
1347         STRLEN lnc;
1348         register STRLEN uskip;
1349         unsigned int c1;
1350         unsigned int c2;
1351         char *e;
1352         register I32 tmp = 1;   /* Scratch variable? */
1353         register const bool utf8_target = PL_reg_match_utf8;
1354         RXi_GET_DECL(prog,progi);
1355
1356         PERL_ARGS_ASSERT_FIND_BYCLASS;
1357
1358         /* We know what class it must start with. */
1359         switch (OP(c)) {
1360         case ANYOF:
1361             if (utf8_target) {
1362                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1363                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1364                           reginclass(prog, c, (U8*)s, 0, utf8_target) :
1365                           REGINCLASS(prog, c, (U8*)s));
1366             }
1367             else {
1368                  while (s < strend) {
1369                       STRLEN skip = 1;
1370
1371                       if (REGINCLASS(prog, c, (U8*)s) ||
1372                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1373                            /* The assignment of 2 is intentional:
1374                             * for the folded sharp s, the skip is 2. */
1375                            (skip = SHARP_S_SKIP))) {
1376                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1377                                 goto got_it;
1378                            else
1379                                 tmp = doevery;
1380                       }
1381                       else
1382                            tmp = 1;
1383                       s += skip;
1384                  }
1385             }
1386             break;
1387         case CANY:
1388             REXEC_FBC_SCAN(
1389                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1390                     goto got_it;
1391                 else
1392                     tmp = doevery;
1393             );
1394             break;
1395         case EXACTF:
1396             m   = STRING(c);
1397             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1398             lnc = (I32) ln;     /* length to match in characters */
1399             if (UTF_PATTERN) {
1400                 STRLEN ulen1, ulen2;
1401                 U8 *sm = (U8 *) m;
1402                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1403                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1404                 /* used by commented-out code below */
1405                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1406
1407                 /* XXX: Since the node will be case folded at compile
1408                    time this logic is a little odd, although im not
1409                    sure that its actually wrong. --dmq */
1410
1411                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1412                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1413
1414                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1415                    codepoint of the first character in the converted
1416                    form, yet originally we did the extra step.
1417                    No tests fail by commenting this code out however
1418                    so Ive left it out. -- dmq.
1419
1420                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1421                                     0, uniflags);
1422                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1423                                     0, uniflags);
1424                 */
1425
1426                 lnc = 0;
1427                 while (sm < ((U8 *) m + ln)) {
1428                     lnc++;
1429                     sm += UTF8SKIP(sm);
1430                 }
1431             }
1432             else {
1433                 c1 = *(U8*)m;
1434                 c2 = PL_fold[c1];
1435             }
1436             goto do_exactf;
1437         case EXACTFL:
1438             m   = STRING(c);
1439             ln  = STR_LEN(c);
1440             lnc = (I32) ln;
1441             c1 = *(U8*)m;
1442             c2 = PL_fold_locale[c1];
1443           do_exactf:
1444             e = HOP3c(strend, -((I32)lnc), s);
1445
1446             if (!reginfo && e < s)
1447                 e = s;                  /* Due to minlen logic of intuit() */
1448
1449             /* The idea in the EXACTF* cases is to first find the
1450              * first character of the EXACTF* node and then, if
1451              * necessary, case-insensitively compare the full
1452              * text of the node.  The c1 and c2 are the first
1453              * characters (though in Unicode it gets a bit
1454              * more complicated because there are more cases
1455              * than just upper and lower: one needs to use
1456              * the so-called folding case for case-insensitive
1457              * matching (called "loose matching" in Unicode).
1458              * foldEQ_utf8() will do just that. */
1459
1460             if (utf8_target || UTF_PATTERN) {
1461                 UV c, f;
1462                 U8 tmpbuf [UTF8_MAXBYTES+1];
1463                 STRLEN len = 1;
1464                 STRLEN foldlen;
1465                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1466                 if (c1 == c2) {
1467                     /* Upper and lower of 1st char are equal -
1468                      * probably not a "letter". */
1469                     while (s <= e) {
1470                         if (utf8_target) {
1471                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1472                                            uniflags);
1473                         } else {
1474                             c = *((U8*)s);
1475                         }
1476                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1477                     }
1478                 }
1479                 else {
1480                     while (s <= e) {
1481                         if (utf8_target) {
1482                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1483                                            uniflags);
1484                         } else {
1485                             c = *((U8*)s);
1486                         }
1487
1488                         /* Handle some of the three Greek sigmas cases.
1489                          * Note that not all the possible combinations
1490                          * are handled here: some of them are handled
1491                          * by the standard folding rules, and some of
1492                          * them (the character class or ANYOF cases)
1493                          * are handled during compiletime in
1494                          * regexec.c:S_regclass(). */
1495                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1496                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1497                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1498
1499                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1500                     }
1501                 }
1502             }
1503             else {
1504                 /* Neither pattern nor string are UTF8 */
1505                 if (c1 == c2)
1506                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1507                 else
1508                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1509             }
1510             break;
1511         case BOUNDL:
1512             PL_reg_flags |= RF_tainted;
1513             /* FALL THROUGH */
1514         case BOUND:
1515             if (utf8_target) {
1516                 if (s == PL_bostr)
1517                     tmp = '\n';
1518                 else {
1519                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1520                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1521                 }
1522                 tmp = ((OP(c) == BOUND ?
1523                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1524                 LOAD_UTF8_CHARCLASS_ALNUM();
1525                 REXEC_FBC_UTF8_SCAN(
1526                     if (tmp == !(OP(c) == BOUND ?
1527                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1528                                  isALNUM_LC_utf8((U8*)s)))
1529                     {
1530                         tmp = !tmp;
1531                         REXEC_FBC_TRYIT;
1532                 }
1533                 );
1534             }
1535             else {  /* Not utf8 */
1536                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1537                 tmp = cBOOL((OP(c) == BOUNDL)
1538                             ? isALNUM_LC(tmp)
1539                             : (isWORDCHAR_L1(tmp)
1540                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1541                 REXEC_FBC_SCAN(
1542                     if (tmp ==
1543                         !((OP(c) == BOUNDL)
1544                           ? isALNUM_LC(*s)
1545                           : (isWORDCHAR_L1((U8) *s)
1546                              && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1547                     {
1548                         tmp = !tmp;
1549                         REXEC_FBC_TRYIT;
1550                 }
1551                 );
1552             }
1553             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1554                 goto got_it;
1555             break;
1556         case NBOUNDL:
1557             PL_reg_flags |= RF_tainted;
1558             /* FALL THROUGH */
1559         case NBOUND:
1560             if (utf8_target) {
1561                 if (s == PL_bostr)
1562                     tmp = '\n';
1563                 else {
1564                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1565                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1566                 }
1567                 tmp = ((OP(c) == NBOUND ?
1568                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1569                 LOAD_UTF8_CHARCLASS_ALNUM();
1570                 REXEC_FBC_UTF8_SCAN(
1571                     if (tmp == !(OP(c) == NBOUND ?
1572                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1573                                  isALNUM_LC_utf8((U8*)s)))
1574                         tmp = !tmp;
1575                     else REXEC_FBC_TRYIT;
1576                 );
1577             }
1578             else {
1579                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1580                 tmp = cBOOL((OP(c) == NBOUNDL)
1581                             ? isALNUM_LC(tmp)
1582                             : (isWORDCHAR_L1(tmp)
1583                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1584                 REXEC_FBC_SCAN(
1585                     if (tmp == ! cBOOL(
1586                             (OP(c) == NBOUNDL)
1587                             ? isALNUM_LC(*s)
1588                             : (isWORDCHAR_L1((U8) *s)
1589                                && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1590                     {
1591                         tmp = !tmp;
1592                     }
1593                     else REXEC_FBC_TRYIT;
1594                 );
1595             }
1596             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1597                 goto got_it;
1598             break;
1599         case ALNUM:
1600             REXEC_FBC_CSCAN_PRELOAD(
1601                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1602                 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1603                 (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
1604             );
1605         case ALNUML:
1606             REXEC_FBC_CSCAN_TAINT(
1607                 isALNUM_LC_utf8((U8*)s),
1608                 isALNUM_LC(*s)
1609             );
1610         case NALNUM:
1611             REXEC_FBC_CSCAN_PRELOAD(
1612                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1613                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1614                 ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
1615             );
1616         case NALNUML:
1617             REXEC_FBC_CSCAN_TAINT(
1618                 !isALNUM_LC_utf8((U8*)s),
1619                 !isALNUM_LC(*s)
1620             );
1621         case SPACE:
1622             REXEC_FBC_CSCAN_PRELOAD(
1623                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1624                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1625                 isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
1626             );
1627         case SPACEL:
1628             REXEC_FBC_CSCAN_TAINT(
1629                 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1630                 isSPACE_LC(*s)
1631             );
1632         case NSPACE:
1633             REXEC_FBC_CSCAN_PRELOAD(
1634                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1635                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1636                 !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
1637             );
1638         case NSPACEL:
1639             REXEC_FBC_CSCAN_TAINT(
1640                 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1641                 !isSPACE_LC(*s)
1642             );
1643         case DIGIT:
1644             REXEC_FBC_CSCAN_PRELOAD(
1645                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1646                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1647                 isDIGIT(*s)
1648             );
1649         case DIGITL:
1650             REXEC_FBC_CSCAN_TAINT(
1651                 isDIGIT_LC_utf8((U8*)s),
1652                 isDIGIT_LC(*s)
1653             );
1654         case NDIGIT:
1655             REXEC_FBC_CSCAN_PRELOAD(
1656                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1657                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1658                 !isDIGIT(*s)
1659             );
1660         case NDIGITL:
1661             REXEC_FBC_CSCAN_TAINT(
1662                 !isDIGIT_LC_utf8((U8*)s),
1663                 !isDIGIT_LC(*s)
1664             );
1665         case LNBREAK:
1666             REXEC_FBC_CSCAN(
1667                 is_LNBREAK_utf8(s),
1668                 is_LNBREAK_latin1(s)
1669             );
1670         case VERTWS:
1671             REXEC_FBC_CSCAN(
1672                 is_VERTWS_utf8(s),
1673                 is_VERTWS_latin1(s)
1674             );
1675         case NVERTWS:
1676             REXEC_FBC_CSCAN(
1677                 !is_VERTWS_utf8(s),
1678                 !is_VERTWS_latin1(s)
1679             );
1680         case HORIZWS:
1681             REXEC_FBC_CSCAN(
1682                 is_HORIZWS_utf8(s),
1683                 is_HORIZWS_latin1(s)
1684             );
1685         case NHORIZWS:
1686             REXEC_FBC_CSCAN(
1687                 !is_HORIZWS_utf8(s),
1688                 !is_HORIZWS_latin1(s)
1689             );
1690         case AHOCORASICKC:
1691         case AHOCORASICK:
1692             {
1693                 DECL_TRIE_TYPE(c);
1694                 /* what trie are we using right now */
1695                 reg_ac_data *aho
1696                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1697                 reg_trie_data *trie
1698                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1699                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1700
1701                 const char *last_start = strend - trie->minlen;
1702 #ifdef DEBUGGING
1703                 const char *real_start = s;
1704 #endif
1705                 STRLEN maxlen = trie->maxlen;
1706                 SV *sv_points;
1707                 U8 **points; /* map of where we were in the input string
1708                                 when reading a given char. For ASCII this
1709                                 is unnecessary overhead as the relationship
1710                                 is always 1:1, but for Unicode, especially
1711                                 case folded Unicode this is not true. */
1712                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1713                 U8 *bitmap=NULL;
1714
1715
1716                 GET_RE_DEBUG_FLAGS_DECL;
1717
1718                 /* We can't just allocate points here. We need to wrap it in
1719                  * an SV so it gets freed properly if there is a croak while
1720                  * running the match */
1721                 ENTER;
1722                 SAVETMPS;
1723                 sv_points=newSV(maxlen * sizeof(U8 *));
1724                 SvCUR_set(sv_points,
1725                     maxlen * sizeof(U8 *));
1726                 SvPOK_on(sv_points);
1727                 sv_2mortal(sv_points);
1728                 points=(U8**)SvPV_nolen(sv_points );
1729                 if ( trie_type != trie_utf8_fold
1730                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1731                 {
1732                     if (trie->bitmap)
1733                         bitmap=(U8*)trie->bitmap;
1734                     else
1735                         bitmap=(U8*)ANYOF_BITMAP(c);
1736                 }
1737                 /* this is the Aho-Corasick algorithm modified a touch
1738                    to include special handling for long "unknown char"
1739                    sequences. The basic idea being that we use AC as long
1740                    as we are dealing with a possible matching char, when
1741                    we encounter an unknown char (and we have not encountered
1742                    an accepting state) we scan forward until we find a legal
1743                    starting char.
1744                    AC matching is basically that of trie matching, except
1745                    that when we encounter a failing transition, we fall back
1746                    to the current states "fail state", and try the current char
1747                    again, a process we repeat until we reach the root state,
1748                    state 1, or a legal transition. If we fail on the root state
1749                    then we can either terminate if we have reached an accepting
1750                    state previously, or restart the entire process from the beginning
1751                    if we have not.
1752
1753                  */
1754                 while (s <= last_start) {
1755                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1756                     U8 *uc = (U8*)s;
1757                     U16 charid = 0;
1758                     U32 base = 1;
1759                     U32 state = 1;
1760                     UV uvc = 0;
1761                     STRLEN len = 0;
1762                     STRLEN foldlen = 0;
1763                     U8 *uscan = (U8*)NULL;
1764                     U8 *leftmost = NULL;
1765 #ifdef DEBUGGING
1766                     U32 accepted_word= 0;
1767 #endif
1768                     U32 pointpos = 0;
1769
1770                     while ( state && uc <= (U8*)strend ) {
1771                         int failed=0;
1772                         U32 word = aho->states[ state ].wordnum;
1773
1774                         if( state==1 ) {
1775                             if ( bitmap ) {
1776                                 DEBUG_TRIE_EXECUTE_r(
1777                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1778                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1779                                             (char *)uc, utf8_target );
1780                                         PerlIO_printf( Perl_debug_log,
1781                                             " Scanning for legal start char...\n");
1782                                     }
1783                                 );
1784                                 if (utf8_target) {
1785                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1786                                         uc += UTF8SKIP(uc);
1787                                     }
1788                                 } else {
1789                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1790                                         uc++;
1791                                     }
1792                                 }
1793                                 s= (char *)uc;
1794                             }
1795                             if (uc >(U8*)last_start) break;
1796                         }
1797
1798                         if ( word ) {
1799                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1800                             if (!leftmost || lpos < leftmost) {
1801                                 DEBUG_r(accepted_word=word);
1802                                 leftmost= lpos;
1803                             }
1804                             if (base==0) break;
1805
1806                         }
1807                         points[pointpos++ % maxlen]= uc;
1808                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1809                                              uscan, len, uvc, charid, foldlen,
1810                                              foldbuf, uniflags);
1811                         DEBUG_TRIE_EXECUTE_r({
1812                             dump_exec_pos( (char *)uc, c, strend, real_start,
1813                                 s,   utf8_target );
1814                             PerlIO_printf(Perl_debug_log,
1815                                 " Charid:%3u CP:%4"UVxf" ",
1816                                  charid, uvc);
1817                         });
1818
1819                         do {
1820 #ifdef DEBUGGING
1821                             word = aho->states[ state ].wordnum;
1822 #endif
1823                             base = aho->states[ state ].trans.base;
1824
1825                             DEBUG_TRIE_EXECUTE_r({
1826                                 if (failed)
1827                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1828                                         s,   utf8_target );
1829                                 PerlIO_printf( Perl_debug_log,
1830                                     "%sState: %4"UVxf", word=%"UVxf,
1831                                     failed ? " Fail transition to " : "",
1832                                     (UV)state, (UV)word);
1833                             });
1834                             if ( base ) {
1835                                 U32 tmp;
1836                                 I32 offset;
1837                                 if (charid &&
1838                                      ( ((offset = base + charid
1839                                         - 1 - trie->uniquecharcount)) >= 0)
1840                                      && ((U32)offset < trie->lasttrans)
1841                                      && trie->trans[offset].check == state
1842                                      && (tmp=trie->trans[offset].next))
1843                                 {
1844                                     DEBUG_TRIE_EXECUTE_r(
1845                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1846                                     state = tmp;
1847                                     break;
1848                                 }
1849                                 else {
1850                                     DEBUG_TRIE_EXECUTE_r(
1851                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1852                                     failed = 1;
1853                                     state = aho->fail[state];
1854                                 }
1855                             }
1856                             else {
1857                                 /* we must be accepting here */
1858                                 DEBUG_TRIE_EXECUTE_r(
1859                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1860                                 failed = 1;
1861                                 break;
1862                             }
1863                         } while(state);
1864                         uc += len;
1865                         if (failed) {
1866                             if (leftmost)
1867                                 break;
1868                             if (!state) state = 1;
1869                         }
1870                     }
1871                     if ( aho->states[ state ].wordnum ) {
1872                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1873                         if (!leftmost || lpos < leftmost) {
1874                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1875                             leftmost = lpos;
1876                         }
1877                     }
1878                     if (leftmost) {
1879                         s = (char*)leftmost;
1880                         DEBUG_TRIE_EXECUTE_r({
1881                             PerlIO_printf(
1882                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1883                                 (UV)accepted_word, (IV)(s - real_start)
1884                             );
1885                         });
1886                         if (!reginfo || regtry(reginfo, &s)) {
1887                             FREETMPS;
1888                             LEAVE;
1889                             goto got_it;
1890                         }
1891                         s = HOPc(s,1);
1892                         DEBUG_TRIE_EXECUTE_r({
1893                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1894                         });
1895                     } else {
1896                         DEBUG_TRIE_EXECUTE_r(
1897                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1898                         break;
1899                     }
1900                 }
1901                 FREETMPS;
1902                 LEAVE;
1903             }
1904             break;
1905         default:
1906             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1907             break;
1908         }
1909         return 0;
1910       got_it:
1911         return s;
1912 }
1913
1914
1915 /*
1916  - regexec_flags - match a regexp against a string
1917  */
1918 I32
1919 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1920               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1921 /* strend: pointer to null at end of string */
1922 /* strbeg: real beginning of string */
1923 /* minend: end of match must be >=minend after stringarg. */
1924 /* data: May be used for some additional optimizations.
1925          Currently its only used, with a U32 cast, for transmitting
1926          the ganch offset when doing a /g match. This will change */
1927 /* nosave: For optimizations. */
1928 {
1929     dVAR;
1930     struct regexp *const prog = (struct regexp *)SvANY(rx);
1931     /*register*/ char *s;
1932     register regnode *c;
1933     /*register*/ char *startpos = stringarg;
1934     I32 minlen;         /* must match at least this many chars */
1935     I32 dontbother = 0; /* how many characters not to try at end */
1936     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1937     I32 scream_pos = -1;                /* Internal iterator of scream. */
1938     char *scream_olds = NULL;
1939     const bool utf8_target = cBOOL(DO_UTF8(sv));
1940     I32 multiline;
1941     RXi_GET_DECL(prog,progi);
1942     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1943     regexp_paren_pair *swap = NULL;
1944     GET_RE_DEBUG_FLAGS_DECL;
1945
1946     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1947     PERL_UNUSED_ARG(data);
1948
1949     /* Be paranoid... */
1950     if (prog == NULL || startpos == NULL) {
1951         Perl_croak(aTHX_ "NULL regexp parameter");
1952         return 0;
1953     }
1954
1955     multiline = prog->extflags & RXf_PMf_MULTILINE;
1956     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1957
1958     RX_MATCH_UTF8_set(rx, utf8_target);
1959     DEBUG_EXECUTE_r(
1960         debug_start_match(rx, utf8_target, startpos, strend,
1961         "Matching");
1962     );
1963
1964     minlen = prog->minlen;
1965
1966     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1967         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1968                               "String too short [regexec_flags]...\n"));
1969         goto phooey;
1970     }
1971
1972
1973     /* Check validity of program. */
1974     if (UCHARAT(progi->program) != REG_MAGIC) {
1975         Perl_croak(aTHX_ "corrupted regexp program");
1976     }
1977
1978     PL_reg_flags = 0;
1979     PL_reg_eval_set = 0;
1980     PL_reg_maxiter = 0;
1981
1982     if (RX_UTF8(rx))
1983         PL_reg_flags |= RF_utf8;
1984
1985     /* Mark beginning of line for ^ and lookbehind. */
1986     reginfo.bol = startpos; /* XXX not used ??? */
1987     PL_bostr  = strbeg;
1988     reginfo.sv = sv;
1989
1990     /* Mark end of line for $ (and such) */
1991     PL_regeol = strend;
1992
1993     /* see how far we have to get to not match where we matched before */
1994     reginfo.till = startpos+minend;
1995
1996     /* If there is a "must appear" string, look for it. */
1997     s = startpos;
1998
1999     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2000         MAGIC *mg;
2001         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2002             reginfo.ganch = startpos + prog->gofs;
2003             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2004               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2005         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2006                   && SvMAGIC(sv)
2007                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2008                   && mg->mg_len >= 0) {
2009             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2010             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2011                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2012
2013             if (prog->extflags & RXf_ANCH_GPOS) {
2014                 if (s > reginfo.ganch)
2015                     goto phooey;
2016                 s = reginfo.ganch - prog->gofs;
2017                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2018                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2019                 if (s < strbeg)
2020                     goto phooey;
2021             }
2022         }
2023         else if (data) {
2024             reginfo.ganch = strbeg + PTR2UV(data);
2025             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2026                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2027
2028         } else {                                /* pos() not defined */
2029             reginfo.ganch = strbeg;
2030             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2031                  "GPOS: reginfo.ganch = strbeg\n"));
2032         }
2033     }
2034     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2035         /* We have to be careful. If the previous successful match
2036            was from this regex we don't want a subsequent partially
2037            successful match to clobber the old results.
2038            So when we detect this possibility we add a swap buffer
2039            to the re, and switch the buffer each match. If we fail
2040            we switch it back, otherwise we leave it swapped.
2041         */
2042         swap = prog->offs;
2043         /* do we need a save destructor here for eval dies? */
2044         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2045     }
2046     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2047         re_scream_pos_data d;
2048
2049         d.scream_olds = &scream_olds;
2050         d.scream_pos = &scream_pos;
2051         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2052         if (!s) {
2053             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2054             goto phooey;        /* not present */
2055         }
2056     }
2057
2058
2059
2060     /* Simplest case:  anchored match need be tried only once. */
2061     /*  [unless only anchor is BOL and multiline is set] */
2062     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2063         if (s == startpos && regtry(&reginfo, &startpos))
2064             goto got_it;
2065         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2066                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2067         {
2068             char *end;
2069
2070             if (minlen)
2071                 dontbother = minlen - 1;
2072             end = HOP3c(strend, -dontbother, strbeg) - 1;
2073             /* for multiline we only have to try after newlines */
2074             if (prog->check_substr || prog->check_utf8) {
2075                 /* because of the goto we can not easily reuse the macros for bifurcating the
2076                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2077                 if (utf8_target) {
2078                     if (s == startpos)
2079                         goto after_try_utf8;
2080                     while (1) {
2081                         if (regtry(&reginfo, &s)) {
2082                             goto got_it;
2083                         }
2084                       after_try_utf8:
2085                         if (s > end) {
2086                             goto phooey;
2087                         }
2088                         if (prog->extflags & RXf_USE_INTUIT) {
2089                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2090                             if (!s) {
2091                                 goto phooey;
2092                             }
2093                         }
2094                         else {
2095                             s += UTF8SKIP(s);
2096                         }
2097                     }
2098                 } /* end search for check string in unicode */
2099                 else {
2100                     if (s == startpos) {
2101                         goto after_try_latin;
2102                     }
2103                     while (1) {
2104                         if (regtry(&reginfo, &s)) {
2105                             goto got_it;
2106                         }
2107                       after_try_latin:
2108                         if (s > end) {
2109                             goto phooey;
2110                         }
2111                         if (prog->extflags & RXf_USE_INTUIT) {
2112                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2113                             if (!s) {
2114                                 goto phooey;
2115                             }
2116                         }
2117                         else {
2118                             s++;
2119                         }
2120                     }
2121                 } /* end search for check string in latin*/
2122             } /* end search for check string */
2123             else { /* search for newline */
2124                 if (s > startpos) {
2125                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2126                     s--;
2127                 }
2128                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2129                 while (s < end) {
2130                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2131                         if (regtry(&reginfo, &s))
2132                             goto got_it;
2133                     }
2134                 }
2135             } /* end search for newline */
2136         } /* end anchored/multiline check string search */
2137         goto phooey;
2138     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2139     {
2140         /* the warning about reginfo.ganch being used without intialization
2141            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2142            and we only enter this block when the same bit is set. */
2143         char *tmp_s = reginfo.ganch - prog->gofs;
2144
2145         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2146             goto got_it;
2147         goto phooey;
2148     }
2149
2150     /* Messy cases:  unanchored match. */
2151     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2152         /* we have /x+whatever/ */
2153         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2154         char ch;
2155 #ifdef DEBUGGING
2156         int did_match = 0;
2157 #endif
2158         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2159             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2160         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2161
2162         if (utf8_target) {
2163             REXEC_FBC_SCAN(
2164                 if (*s == ch) {
2165                     DEBUG_EXECUTE_r( did_match = 1 );
2166                     if (regtry(&reginfo, &s)) goto got_it;
2167                     s += UTF8SKIP(s);
2168                     while (s < strend && *s == ch)
2169                         s += UTF8SKIP(s);
2170                 }
2171             );
2172         }
2173         else {
2174             REXEC_FBC_SCAN(
2175                 if (*s == ch) {
2176                     DEBUG_EXECUTE_r( did_match = 1 );
2177                     if (regtry(&reginfo, &s)) goto got_it;
2178                     s++;
2179                     while (s < strend && *s == ch)
2180                         s++;
2181                 }
2182             );
2183         }
2184         DEBUG_EXECUTE_r(if (!did_match)
2185                 PerlIO_printf(Perl_debug_log,
2186                                   "Did not find anchored character...\n")
2187                );
2188     }
2189     else if (prog->anchored_substr != NULL
2190               || prog->anchored_utf8 != NULL
2191               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2192                   && prog->float_max_offset < strend - s)) {
2193         SV *must;
2194         I32 back_max;
2195         I32 back_min;
2196         char *last;
2197         char *last1;            /* Last position checked before */
2198 #ifdef DEBUGGING
2199         int did_match = 0;
2200 #endif
2201         if (prog->anchored_substr || prog->anchored_utf8) {
2202             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2203                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2204             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2205             back_max = back_min = prog->anchored_offset;
2206         } else {
2207             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2208                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2209             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2210             back_max = prog->float_max_offset;
2211             back_min = prog->float_min_offset;
2212         }
2213
2214
2215         if (must == &PL_sv_undef)
2216             /* could not downgrade utf8 check substring, so must fail */
2217             goto phooey;
2218
2219         if (back_min<0) {
2220             last = strend;
2221         } else {
2222             last = HOP3c(strend,        /* Cannot start after this */
2223                   -(I32)(CHR_SVLEN(must)
2224                          - (SvTAIL(must) != 0) + back_min), strbeg);
2225         }
2226         if (s > PL_bostr)
2227             last1 = HOPc(s, -1);
2228         else
2229             last1 = s - 1;      /* bogus */
2230
2231         /* XXXX check_substr already used to find "s", can optimize if
2232            check_substr==must. */
2233         scream_pos = -1;
2234         dontbother = end_shift;
2235         strend = HOPc(strend, -dontbother);
2236         while ( (s <= last) &&
2237                 ((flags & REXEC_SCREAM)
2238                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2239                                     end_shift, &scream_pos, 0))
2240                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2241                                   (unsigned char*)strend, must,
2242                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2243             /* we may be pointing at the wrong string */
2244             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2245                 s = strbeg + (s - SvPVX_const(sv));
2246             DEBUG_EXECUTE_r( did_match = 1 );
2247             if (HOPc(s, -back_max) > last1) {
2248                 last1 = HOPc(s, -back_min);
2249                 s = HOPc(s, -back_max);
2250             }
2251             else {
2252                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2253
2254                 last1 = HOPc(s, -back_min);
2255                 s = t;
2256             }
2257             if (utf8_target) {
2258                 while (s <= last1) {
2259                     if (regtry(&reginfo, &s))
2260                         goto got_it;
2261                     s += UTF8SKIP(s);
2262                 }
2263             }
2264             else {
2265                 while (s <= last1) {
2266                     if (regtry(&reginfo, &s))
2267                         goto got_it;
2268                     s++;
2269                 }
2270             }
2271         }
2272         DEBUG_EXECUTE_r(if (!did_match) {
2273             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2274                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2275             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2276                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2277                                ? "anchored" : "floating"),
2278                 quoted, RE_SV_TAIL(must));
2279         });
2280         goto phooey;
2281     }
2282     else if ( (c = progi->regstclass) ) {
2283         if (minlen) {
2284             const OPCODE op = OP(progi->regstclass);
2285             /* don't bother with what can't match */
2286             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2287                 strend = HOPc(strend, -(minlen - 1));
2288         }
2289         DEBUG_EXECUTE_r({
2290             SV * const prop = sv_newmortal();
2291             regprop(prog, prop, c);
2292             {
2293                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2294                     s,strend-s,60);
2295                 PerlIO_printf(Perl_debug_log,
2296                     "Matching stclass %.*s against %s (%d bytes)\n",
2297                     (int)SvCUR(prop), SvPVX_const(prop),
2298                      quoted, (int)(strend - s));
2299             }
2300         });
2301         if (find_byclass(prog, c, s, strend, &reginfo))
2302             goto got_it;
2303         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2304     }
2305     else {
2306         dontbother = 0;
2307         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2308             /* Trim the end. */
2309             char *last;
2310             SV* float_real;
2311
2312             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2313                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2314             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2315
2316             if (flags & REXEC_SCREAM) {
2317                 last = screaminstr(sv, float_real, s - strbeg,
2318                                    end_shift, &scream_pos, 1); /* last one */
2319                 if (!last)
2320                     last = scream_olds; /* Only one occurrence. */
2321                 /* we may be pointing at the wrong string */
2322                 else if (RXp_MATCH_COPIED(prog))
2323                     s = strbeg + (s - SvPVX_const(sv));
2324             }
2325             else {
2326                 STRLEN len;
2327                 const char * const little = SvPV_const(float_real, len);
2328
2329                 if (SvTAIL(float_real)) {
2330                     if (memEQ(strend - len + 1, little, len - 1))
2331                         last = strend - len + 1;
2332                     else if (!multiline)
2333                         last = memEQ(strend - len, little, len)
2334                             ? strend - len : NULL;
2335                     else
2336                         goto find_last;
2337                 } else {
2338                   find_last:
2339                     if (len)
2340                         last = rninstr(s, strend, little, little + len);
2341                     else
2342                         last = strend;  /* matching "$" */
2343                 }
2344             }
2345             if (last == NULL) {
2346                 DEBUG_EXECUTE_r(
2347                     PerlIO_printf(Perl_debug_log,
2348                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2349                         PL_colors[4], PL_colors[5]));
2350                 goto phooey; /* Should not happen! */
2351             }
2352             dontbother = strend - last + prog->float_min_offset;
2353         }
2354         if (minlen && (dontbother < minlen))
2355             dontbother = minlen - 1;
2356         strend -= dontbother;              /* this one's always in bytes! */
2357         /* We don't know much -- general case. */
2358         if (utf8_target) {
2359             for (;;) {
2360                 if (regtry(&reginfo, &s))
2361                     goto got_it;
2362                 if (s >= strend)
2363                     break;
2364                 s += UTF8SKIP(s);
2365             };
2366         }
2367         else {
2368             do {
2369                 if (regtry(&reginfo, &s))
2370                     goto got_it;
2371             } while (s++ < strend);
2372         }
2373     }
2374
2375     /* Failure. */
2376     goto phooey;
2377
2378 got_it:
2379     Safefree(swap);
2380     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2381
2382     if (PL_reg_eval_set)
2383         restore_pos(aTHX_ prog);
2384     if (RXp_PAREN_NAMES(prog))
2385         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2386
2387     /* make sure $`, $&, $', and $digit will work later */
2388     if ( !(flags & REXEC_NOT_FIRST) ) {
2389         RX_MATCH_COPY_FREE(rx);
2390         if (flags & REXEC_COPY_STR) {
2391             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2392 #ifdef PERL_OLD_COPY_ON_WRITE
2393             if ((SvIsCOW(sv)
2394                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2395                 if (DEBUG_C_TEST) {
2396                     PerlIO_printf(Perl_debug_log,
2397                                   "Copy on write: regexp capture, type %d\n",
2398                                   (int) SvTYPE(sv));
2399                 }
2400                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2401                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2402                 assert (SvPOKp(prog->saved_copy));
2403             } else
2404 #endif
2405             {
2406                 RX_MATCH_COPIED_on(rx);
2407                 s = savepvn(strbeg, i);
2408                 prog->subbeg = s;
2409             }
2410             prog->sublen = i;
2411         }
2412         else {
2413             prog->subbeg = strbeg;
2414             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2415         }
2416     }
2417
2418     return 1;
2419
2420 phooey:
2421     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2422                           PL_colors[4], PL_colors[5]));
2423     if (PL_reg_eval_set)
2424         restore_pos(aTHX_ prog);
2425     if (swap) {
2426         /* we failed :-( roll it back */
2427         Safefree(prog->offs);
2428         prog->offs = swap;
2429     }
2430
2431     return 0;
2432 }
2433
2434
2435 /*
2436  - regtry - try match at specific point
2437  */
2438 STATIC I32                      /* 0 failure, 1 success */
2439 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2440 {
2441     dVAR;
2442     CHECKPOINT lastcp;
2443     REGEXP *const rx = reginfo->prog;
2444     regexp *const prog = (struct regexp *)SvANY(rx);
2445     RXi_GET_DECL(prog,progi);
2446     GET_RE_DEBUG_FLAGS_DECL;
2447
2448     PERL_ARGS_ASSERT_REGTRY;
2449
2450     reginfo->cutpoint=NULL;
2451
2452     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2453         MAGIC *mg;
2454
2455         PL_reg_eval_set = RS_init;
2456         DEBUG_EXECUTE_r(DEBUG_s(
2457             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2458                           (IV)(PL_stack_sp - PL_stack_base));
2459             ));
2460         SAVESTACK_CXPOS();
2461         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2462         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2463         SAVETMPS;
2464         /* Apparently this is not needed, judging by wantarray. */
2465         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2466            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2467
2468         if (reginfo->sv) {
2469             /* Make $_ available to executed code. */
2470             if (reginfo->sv != DEFSV) {
2471                 SAVE_DEFSV;
2472                 DEFSV_set(reginfo->sv);
2473             }
2474
2475             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2476                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2477                 /* prepare for quick setting of pos */
2478 #ifdef PERL_OLD_COPY_ON_WRITE
2479                 if (SvIsCOW(reginfo->sv))
2480                     sv_force_normal_flags(reginfo->sv, 0);
2481 #endif
2482                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2483                                  &PL_vtbl_mglob, NULL, 0);
2484                 mg->mg_len = -1;
2485             }
2486             PL_reg_magic    = mg;
2487             PL_reg_oldpos   = mg->mg_len;
2488             SAVEDESTRUCTOR_X(restore_pos, prog);
2489         }
2490         if (!PL_reg_curpm) {
2491             Newxz(PL_reg_curpm, 1, PMOP);
2492 #ifdef USE_ITHREADS
2493             {
2494                 SV* const repointer = &PL_sv_undef;
2495                 /* this regexp is also owned by the new PL_reg_curpm, which
2496                    will try to free it.  */
2497                 av_push(PL_regex_padav, repointer);
2498                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2499                 PL_regex_pad = AvARRAY(PL_regex_padav);
2500             }
2501 #endif
2502         }
2503 #ifdef USE_ITHREADS
2504         /* It seems that non-ithreads works both with and without this code.
2505            So for efficiency reasons it seems best not to have the code
2506            compiled when it is not needed.  */
2507         /* This is safe against NULLs: */
2508         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2509         /* PM_reg_curpm owns a reference to this regexp.  */
2510         ReREFCNT_inc(rx);
2511 #endif
2512         PM_SETRE(PL_reg_curpm, rx);
2513         PL_reg_oldcurpm = PL_curpm;
2514         PL_curpm = PL_reg_curpm;
2515         if (RXp_MATCH_COPIED(prog)) {
2516             /*  Here is a serious problem: we cannot rewrite subbeg,
2517                 since it may be needed if this match fails.  Thus
2518                 $` inside (?{}) could fail... */
2519             PL_reg_oldsaved = prog->subbeg;
2520             PL_reg_oldsavedlen = prog->sublen;
2521 #ifdef PERL_OLD_COPY_ON_WRITE
2522             PL_nrs = prog->saved_copy;
2523 #endif
2524             RXp_MATCH_COPIED_off(prog);
2525         }
2526         else
2527             PL_reg_oldsaved = NULL;
2528         prog->subbeg = PL_bostr;
2529         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2530     }
2531     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2532     prog->offs[0].start = *startpos - PL_bostr;
2533     PL_reginput = *startpos;
2534     PL_reglastparen = &prog->lastparen;
2535     PL_reglastcloseparen = &prog->lastcloseparen;
2536     prog->lastparen = 0;
2537     prog->lastcloseparen = 0;
2538     PL_regsize = 0;
2539     PL_regoffs = prog->offs;
2540     if (PL_reg_start_tmpl <= prog->nparens) {
2541         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2542         if(PL_reg_start_tmp)
2543             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2544         else
2545             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2546     }
2547
2548     /* XXXX What this code is doing here?!!!  There should be no need
2549        to do this again and again, PL_reglastparen should take care of
2550        this!  --ilya*/
2551
2552     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2553      * Actually, the code in regcppop() (which Ilya may be meaning by
2554      * PL_reglastparen), is not needed at all by the test suite
2555      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2556      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2557      * Meanwhile, this code *is* needed for the
2558      * above-mentioned test suite tests to succeed.  The common theme
2559      * on those tests seems to be returning null fields from matches.
2560      * --jhi updated by dapm */
2561 #if 1
2562     if (prog->nparens) {
2563         regexp_paren_pair *pp = PL_regoffs;
2564         register I32 i;
2565         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2566             ++pp;
2567             pp->start = -1;
2568             pp->end = -1;
2569         }
2570     }
2571 #endif
2572     REGCP_SET(lastcp);
2573     if (regmatch(reginfo, progi->program + 1)) {
2574         PL_regoffs[0].end = PL_reginput - PL_bostr;
2575         return 1;
2576     }
2577     if (reginfo->cutpoint)
2578         *startpos= reginfo->cutpoint;
2579     REGCP_UNWIND(lastcp);
2580     return 0;
2581 }
2582
2583
2584 #define sayYES goto yes
2585 #define sayNO goto no
2586 #define sayNO_SILENT goto no_silent
2587
2588 /* we dont use STMT_START/END here because it leads to
2589    "unreachable code" warnings, which are bogus, but distracting. */
2590 #define CACHEsayNO \
2591     if (ST.cache_mask) \
2592        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2593     sayNO
2594
2595 /* this is used to determine how far from the left messages like
2596    'failed...' are printed. It should be set such that messages
2597    are inline with the regop output that created them.
2598 */
2599 #define REPORT_CODE_OFF 32
2600
2601
2602 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2603 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2604
2605 #define SLAB_FIRST(s) (&(s)->states[0])
2606 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2607
2608 /* grab a new slab and return the first slot in it */
2609
2610 STATIC regmatch_state *
2611 S_push_slab(pTHX)
2612 {
2613 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2614     dMY_CXT;
2615 #endif
2616     regmatch_slab *s = PL_regmatch_slab->next;
2617     if (!s) {
2618         Newx(s, 1, regmatch_slab);
2619         s->prev = PL_regmatch_slab;
2620         s->next = NULL;
2621         PL_regmatch_slab->next = s;
2622     }
2623     PL_regmatch_slab = s;
2624     return SLAB_FIRST(s);
2625 }
2626
2627
2628 /* push a new state then goto it */
2629
2630 #define PUSH_STATE_GOTO(state, node) \
2631     scan = node; \
2632     st->resume_state = state; \
2633     goto push_state;
2634
2635 /* push a new state with success backtracking, then goto it */
2636
2637 #define PUSH_YES_STATE_GOTO(state, node) \
2638     scan = node; \
2639     st->resume_state = state; \
2640     goto push_yes_state;
2641
2642
2643
2644 /*
2645
2646 regmatch() - main matching routine
2647
2648 This is basically one big switch statement in a loop. We execute an op,
2649 set 'next' to point the next op, and continue. If we come to a point which
2650 we may need to backtrack to on failure such as (A|B|C), we push a
2651 backtrack state onto the backtrack stack. On failure, we pop the top
2652 state, and re-enter the loop at the state indicated. If there are no more
2653 states to pop, we return failure.
2654
2655 Sometimes we also need to backtrack on success; for example /A+/, where
2656 after successfully matching one A, we need to go back and try to
2657 match another one; similarly for lookahead assertions: if the assertion
2658 completes successfully, we backtrack to the state just before the assertion
2659 and then carry on.  In these cases, the pushed state is marked as
2660 'backtrack on success too'. This marking is in fact done by a chain of
2661 pointers, each pointing to the previous 'yes' state. On success, we pop to
2662 the nearest yes state, discarding any intermediate failure-only states.
2663 Sometimes a yes state is pushed just to force some cleanup code to be
2664 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2665 it to free the inner regex.
2666
2667 Note that failure backtracking rewinds the cursor position, while
2668 success backtracking leaves it alone.
2669
2670 A pattern is complete when the END op is executed, while a subpattern
2671 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2672 ops trigger the "pop to last yes state if any, otherwise return true"
2673 behaviour.
2674
2675 A common convention in this function is to use A and B to refer to the two
2676 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2677 the subpattern to be matched possibly multiple times, while B is the entire
2678 rest of the pattern. Variable and state names reflect this convention.
2679
2680 The states in the main switch are the union of ops and failure/success of
2681 substates associated with with that op.  For example, IFMATCH is the op
2682 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2683 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2684 successfully matched A and IFMATCH_A_fail is a state saying that we have
2685 just failed to match A. Resume states always come in pairs. The backtrack
2686 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2687 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2688 on success or failure.
2689
2690 The struct that holds a backtracking state is actually a big union, with
2691 one variant for each major type of op. The variable st points to the
2692 top-most backtrack struct. To make the code clearer, within each
2693 block of code we #define ST to alias the relevant union.
2694
2695 Here's a concrete example of a (vastly oversimplified) IFMATCH
2696 implementation:
2697
2698     switch (state) {
2699     ....
2700
2701 #define ST st->u.ifmatch
2702
2703     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2704         ST.foo = ...; // some state we wish to save
2705         ...
2706         // push a yes backtrack state with a resume value of
2707         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2708         // first node of A:
2709         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2710         // NOTREACHED
2711
2712     case IFMATCH_A: // we have successfully executed A; now continue with B
2713         next = B;
2714         bar = ST.foo; // do something with the preserved value
2715         break;
2716
2717     case IFMATCH_A_fail: // A failed, so the assertion failed
2718         ...;   // do some housekeeping, then ...
2719         sayNO; // propagate the failure
2720
2721 #undef ST
2722
2723     ...
2724     }
2725
2726 For any old-timers reading this who are familiar with the old recursive
2727 approach, the code above is equivalent to:
2728
2729     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2730     {
2731         int foo = ...
2732         ...
2733         if (regmatch(A)) {
2734             next = B;
2735             bar = foo;
2736             break;
2737         }
2738         ...;   // do some housekeeping, then ...
2739         sayNO; // propagate the failure
2740     }
2741
2742 The topmost backtrack state, pointed to by st, is usually free. If you
2743 want to claim it, populate any ST.foo fields in it with values you wish to
2744 save, then do one of
2745
2746         PUSH_STATE_GOTO(resume_state, node);
2747         PUSH_YES_STATE_GOTO(resume_state, node);
2748
2749 which sets that backtrack state's resume value to 'resume_state', pushes a
2750 new free entry to the top of the backtrack stack, then goes to 'node'.
2751 On backtracking, the free slot is popped, and the saved state becomes the
2752 new free state. An ST.foo field in this new top state can be temporarily
2753 accessed to retrieve values, but once the main loop is re-entered, it
2754 becomes available for reuse.
2755
2756 Note that the depth of the backtrack stack constantly increases during the
2757 left-to-right execution of the pattern, rather than going up and down with
2758 the pattern nesting. For example the stack is at its maximum at Z at the
2759 end of the pattern, rather than at X in the following:
2760
2761     /(((X)+)+)+....(Y)+....Z/
2762
2763 The only exceptions to this are lookahead/behind assertions and the cut,
2764 (?>A), which pop all the backtrack states associated with A before
2765 continuing.
2766
2767 Bascktrack state structs are allocated in slabs of about 4K in size.
2768 PL_regmatch_state and st always point to the currently active state,
2769 and PL_regmatch_slab points to the slab currently containing
2770 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2771 allocated, and is never freed until interpreter destruction. When the slab
2772 is full, a new one is allocated and chained to the end. At exit from
2773 regmatch(), slabs allocated since entry are freed.
2774
2775 */
2776
2777
2778 #define DEBUG_STATE_pp(pp)                                  \
2779     DEBUG_STATE_r({                                         \
2780         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2781         PerlIO_printf(Perl_debug_log,                       \
2782             "    %*s"pp" %s%s%s%s%s\n",                     \
2783             depth*2, "",                                    \
2784             PL_reg_name[st->resume_state],                     \
2785             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2786             ((st==yes_state) ? "Y" : ""),                   \
2787             ((st==mark_state) ? "M" : ""),                  \
2788             ((st==yes_state||st==mark_state) ? "]" : "")    \
2789         );                                                  \
2790     });
2791
2792
2793 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2794
2795 #ifdef DEBUGGING
2796
2797 STATIC void
2798 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2799     const char *start, const char *end, const char *blurb)
2800 {
2801     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2802
2803     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2804
2805     if (!PL_colorset)
2806             reginitcolors();
2807     {
2808         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2809             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2810
2811         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2812             start, end - start, 60);
2813
2814         PerlIO_printf(Perl_debug_log,
2815             "%s%s REx%s %s against %s\n",
2816                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2817
2818         if (utf8_target||utf8_pat)
2819             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2820                 utf8_pat ? "pattern" : "",
2821                 utf8_pat && utf8_target ? " and " : "",
2822                 utf8_target ? "string" : ""
2823             );
2824     }
2825 }
2826
2827 STATIC void
2828 S_dump_exec_pos(pTHX_ const char *locinput,
2829                       const regnode *scan,
2830                       const char *loc_regeol,
2831                       const char *loc_bostr,
2832                       const char *loc_reg_starttry,
2833                       const bool utf8_target)
2834 {
2835     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2836     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2837     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2838     /* The part of the string before starttry has one color
2839        (pref0_len chars), between starttry and current
2840        position another one (pref_len - pref0_len chars),
2841        after the current position the third one.
2842        We assume that pref0_len <= pref_len, otherwise we
2843        decrease pref0_len.  */
2844     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2845         ? (5 + taill) - l : locinput - loc_bostr;
2846     int pref0_len;
2847
2848     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2849
2850     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2851         pref_len++;
2852     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2853     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2854         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2855               ? (5 + taill) - pref_len : loc_regeol - locinput);
2856     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2857         l--;
2858     if (pref0_len < 0)
2859         pref0_len = 0;
2860     if (pref0_len > pref_len)
2861         pref0_len = pref_len;
2862     {
2863         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2864
2865         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2866             (locinput - pref_len),pref0_len, 60, 4, 5);
2867
2868         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2869                     (locinput - pref_len + pref0_len),
2870                     pref_len - pref0_len, 60, 2, 3);
2871
2872         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2873                     locinput, loc_regeol - locinput, 10, 0, 1);
2874
2875         const STRLEN tlen=len0+len1+len2;
2876         PerlIO_printf(Perl_debug_log,
2877                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2878                     (IV)(locinput - loc_bostr),
2879                     len0, s0,
2880                     len1, s1,
2881                     (docolor ? "" : "> <"),
2882                     len2, s2,
2883                     (int)(tlen > 19 ? 0 :  19 - tlen),
2884                     "");
2885     }
2886 }
2887
2888 #endif
2889
2890 /* reg_check_named_buff_matched()
2891  * Checks to see if a named buffer has matched. The data array of
2892  * buffer numbers corresponding to the buffer is expected to reside
2893  * in the regexp->data->data array in the slot stored in the ARG() of
2894  * node involved. Note that this routine doesn't actually care about the
2895  * name, that information is not preserved from compilation to execution.
2896  * Returns the index of the leftmost defined buffer with the given name
2897  * or 0 if non of the buffers matched.
2898  */
2899 STATIC I32
2900 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2901 {
2902     I32 n;
2903     RXi_GET_DECL(rex,rexi);
2904     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2905     I32 *nums=(I32*)SvPVX(sv_dat);
2906
2907     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2908
2909     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2910         if ((I32)*PL_reglastparen >= nums[n] &&
2911             PL_regoffs[nums[n]].end != -1)
2912         {
2913             return nums[n];
2914         }
2915     }
2916     return 0;
2917 }
2918
2919
2920 /* free all slabs above current one  - called during LEAVE_SCOPE */
2921
2922 STATIC void
2923 S_clear_backtrack_stack(pTHX_ void *p)
2924 {
2925     regmatch_slab *s = PL_regmatch_slab->next;
2926     PERL_UNUSED_ARG(p);
2927
2928     if (!s)
2929         return;
2930     PL_regmatch_slab->next = NULL;
2931     while (s) {
2932         regmatch_slab * const osl = s;
2933         s = s->next;
2934         Safefree(osl);
2935     }
2936 }
2937
2938
2939 #define SETREX(Re1,Re2) \
2940     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2941     Re1 = (Re2)
2942
2943 STATIC I32                      /* 0 failure, 1 success */
2944 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2945 {
2946 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2947     dMY_CXT;
2948 #endif
2949     dVAR;
2950     register const bool utf8_target = PL_reg_match_utf8;
2951     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2952     REGEXP *rex_sv = reginfo->prog;
2953     regexp *rex = (struct regexp *)SvANY(rex_sv);
2954     RXi_GET_DECL(rex,rexi);
2955     I32 oldsave;
2956     /* the current state. This is a cached copy of PL_regmatch_state */
2957     register regmatch_state *st;
2958     /* cache heavy used fields of st in registers */
2959     register regnode *scan;
2960     register regnode *next;
2961     register U32 n = 0; /* general value; init to avoid compiler warning */
2962     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2963     register char *locinput = PL_reginput;
2964     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2965
2966     bool result = 0;        /* return value of S_regmatch */
2967     int depth = 0;          /* depth of backtrack stack */
2968     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2969     const U32 max_nochange_depth =
2970         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2971         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2972     regmatch_state *yes_state = NULL; /* state to pop to on success of
2973                                                             subpattern */
2974     /* mark_state piggy backs on the yes_state logic so that when we unwind
2975        the stack on success we can update the mark_state as we go */
2976     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2977     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2978     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2979     U32 state_num;
2980     bool no_final = 0;      /* prevent failure from backtracking? */
2981     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2982     char *startpoint = PL_reginput;
2983     SV *popmark = NULL;     /* are we looking for a mark? */
2984     SV *sv_commit = NULL;   /* last mark name seen in failure */
2985     SV *sv_yes_mark = NULL; /* last mark name we have seen
2986                                during a successfull match */
2987     U32 lastopen = 0;       /* last open we saw */
2988     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2989     SV* const oreplsv = GvSV(PL_replgv);
2990     /* these three flags are set by various ops to signal information to
2991      * the very next op. They have a useful lifetime of exactly one loop
2992      * iteration, and are not preserved or restored by state pushes/pops
2993      */
2994     bool sw = 0;            /* the condition value in (?(cond)a|b) */
2995     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
2996     int logical = 0;        /* the following EVAL is:
2997                                 0: (?{...})
2998                                 1: (?(?{...})X|Y)
2999                                 2: (??{...})
3000                                or the following IFMATCH/UNLESSM is:
3001                                 false: plain (?=foo)
3002                                 true:  used as a condition: (?(?=foo))
3003                             */
3004 #ifdef DEBUGGING
3005     GET_RE_DEBUG_FLAGS_DECL;
3006 #endif
3007
3008     PERL_ARGS_ASSERT_REGMATCH;
3009
3010     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3011             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3012     }));
3013     /* on first ever call to regmatch, allocate first slab */
3014     if (!PL_regmatch_slab) {
3015         Newx(PL_regmatch_slab, 1, regmatch_slab);
3016         PL_regmatch_slab->prev = NULL;
3017         PL_regmatch_slab->next = NULL;
3018         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3019     }
3020
3021     oldsave = PL_savestack_ix;
3022     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3023     SAVEVPTR(PL_regmatch_slab);
3024     SAVEVPTR(PL_regmatch_state);
3025
3026     /* grab next free state slot */
3027     st = ++PL_regmatch_state;
3028     if (st >  SLAB_LAST(PL_regmatch_slab))
3029         st = PL_regmatch_state = S_push_slab(aTHX);
3030
3031     /* Note that nextchr is a byte even in UTF */
3032     nextchr = UCHARAT(locinput);
3033     scan = prog;
3034     while (scan != NULL) {
3035
3036         DEBUG_EXECUTE_r( {
3037             SV * const prop = sv_newmortal();
3038             regnode *rnext=regnext(scan);
3039             DUMP_EXEC_POS( locinput, scan, utf8_target );
3040             regprop(rex, prop, scan);
3041
3042             PerlIO_printf(Perl_debug_log,
3043                     "%3"IVdf":%*s%s(%"IVdf")\n",
3044                     (IV)(scan - rexi->program), depth*2, "",
3045                     SvPVX_const(prop),
3046                     (PL_regkind[OP(scan)] == END || !rnext) ?
3047                         0 : (IV)(rnext - rexi->program));
3048         });
3049
3050         next = scan + NEXT_OFF(scan);
3051         if (next == scan)
3052             next = NULL;
3053         state_num = OP(scan);
3054
3055       reenter_switch:
3056
3057         assert(PL_reglastparen == &rex->lastparen);
3058         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3059         assert(PL_regoffs == rex->offs);
3060
3061         switch (state_num) {
3062         case BOL:
3063             if (locinput == PL_bostr)
3064             {
3065                 /* reginfo->till = reginfo->bol; */
3066                 break;
3067             }
3068             sayNO;
3069         case MBOL:
3070             if (locinput == PL_bostr ||
3071                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3072             {
3073                 break;
3074             }
3075             sayNO;
3076         case SBOL:
3077             if (locinput == PL_bostr)
3078                 break;
3079             sayNO;
3080         case GPOS:
3081             if (locinput == reginfo->ganch)
3082                 break;
3083             sayNO;
3084
3085         case KEEPS:
3086             /* update the startpoint */
3087             st->u.keeper.val = PL_regoffs[0].start;
3088             PL_reginput = locinput;
3089             PL_regoffs[0].start = locinput - PL_bostr;
3090             PUSH_STATE_GOTO(KEEPS_next, next);
3091             /*NOT-REACHED*/
3092         case KEEPS_next_fail:
3093             /* rollback the start point change */
3094             PL_regoffs[0].start = st->u.keeper.val;
3095             sayNO_SILENT;
3096             /*NOT-REACHED*/
3097         case EOL:
3098                 goto seol;
3099         case MEOL:
3100             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3101                 sayNO;
3102             break;
3103         case SEOL:
3104           seol:
3105             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3106                 sayNO;
3107             if (PL_regeol - locinput > 1)
3108                 sayNO;
3109             break;
3110         case EOS:
3111             if (PL_regeol != locinput)
3112                 sayNO;
3113             break;
3114         case SANY:
3115             if (!nextchr && locinput >= PL_regeol)
3116                 sayNO;
3117             if (utf8_target) {
3118                 locinput += PL_utf8skip[nextchr];
3119                 if (locinput > PL_regeol)
3120                     sayNO;
3121                 nextchr = UCHARAT(locinput);
3122             }
3123             else
3124                 nextchr = UCHARAT(++locinput);
3125             break;
3126         case CANY:
3127             if (!nextchr && locinput >= PL_regeol)
3128                 sayNO;
3129             nextchr = UCHARAT(++locinput);
3130             break;
3131         case REG_ANY:
3132             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3133                 sayNO;
3134             if (utf8_target) {
3135                 locinput += PL_utf8skip[nextchr];
3136                 if (locinput > PL_regeol)
3137                     sayNO;
3138                 nextchr = UCHARAT(locinput);
3139             }
3140             else
3141                 nextchr = UCHARAT(++locinput);
3142             break;
3143
3144 #undef  ST
3145 #define ST st->u.trie
3146         case TRIEC:
3147             /* In this case the charclass data is available inline so
3148                we can fail fast without a lot of extra overhead.
3149              */
3150             if (scan->flags == EXACT || !utf8_target) {
3151                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3152                     DEBUG_EXECUTE_r(
3153                         PerlIO_printf(Perl_debug_log,
3154                                   "%*s  %sfailed to match trie start class...%s\n",
3155                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3156                     );
3157                     sayNO_SILENT;
3158                     /* NOTREACHED */
3159                 }
3160             }
3161             /* FALL THROUGH */
3162         case TRIE:
3163             /* the basic plan of execution of the trie is:
3164              * At the beginning, run though all the states, and
3165              * find the longest-matching word. Also remember the position
3166              * of the shortest matching word. For example, this pattern:
3167              *    1  2 3 4    5
3168              *    ab|a|x|abcd|abc
3169              * when matched against the string "abcde", will generate
3170              * accept states for all words except 3, with the longest
3171              * matching word being 4, and the shortest being 1 (with
3172              * the position being after char 1 of the string).
3173              *
3174              * Then for each matching word, in word order (i.e. 1,2,4,5),
3175              * we run the remainder of the pattern; on each try setting
3176              * the current position to the character following the word,
3177              * returning to try the next word on failure.
3178              *
3179              * We avoid having to build a list of words at runtime by
3180              * using a compile-time structure, wordinfo[].prev, which
3181              * gives, for each word, the previous accepting word (if any).
3182              * In the case above it would contain the mappings 1->2, 2->0,
3183              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3184              * the longest word (4 above), a list of all words, by
3185              * following the list of prev pointers; this gives us the
3186              * unordered list 4,5,1,2. Then given the current word we have
3187              * just tried, we can go through the list and find the
3188              * next-biggest word to try (so if we just failed on word 2,
3189              * the next in the list is 4).
3190              *
3191              * Since at runtime we don't record the matching position in
3192              * the string for each word, we have to work that out for
3193              * each word we're about to process. The wordinfo table holds
3194              * the character length of each word; given that we recorded
3195              * at the start: the position of the shortest word and its
3196              * length in chars, we just need to move the pointer the
3197              * difference between the two char lengths. Depending on
3198              * Unicode status and folding, that's cheap or expensive.
3199              *
3200              * This algorithm is optimised for the case where are only a
3201              * small number of accept states, i.e. 0,1, or maybe 2.
3202              * With lots of accepts states, and having to try all of them,
3203              * it becomes quadratic on number of accept states to find all
3204              * the next words.
3205              */
3206
3207             {
3208                 /* what type of TRIE am I? (utf8 makes this contextual) */
3209                 DECL_TRIE_TYPE(scan);
3210
3211                 /* what trie are we using right now */
3212                 reg_trie_data * const trie
3213                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3214                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3215                 U32 state = trie->startstate;
3216
3217                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3218                     !TRIE_BITMAP_TEST(trie,*locinput)
3219                 ) {
3220                     if (trie->states[ state ].wordnum) {
3221                          DEBUG_EXECUTE_r(
3222                             PerlIO_printf(Perl_debug_log,
3223                                           "%*s  %smatched empty string...%s\n",
3224                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3225                         );
3226                         if (!trie->jump)
3227                             break;
3228                     } else {
3229                         DEBUG_EXECUTE_r(
3230                             PerlIO_printf(Perl_debug_log,
3231                                           "%*s  %sfailed to match trie start class...%s\n",
3232                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3233                         );
3234                         sayNO_SILENT;
3235                    }
3236                 }
3237
3238             {
3239                 U8 *uc = ( U8* )locinput;
3240
3241                 STRLEN len = 0;
3242                 STRLEN foldlen = 0;
3243                 U8 *uscan = (U8*)NULL;
3244                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3245                 U32 charcount = 0; /* how many input chars we have matched */
3246                 U32 accepted = 0; /* have we seen any accepting states? */
3247
3248                 ST.B = next;
3249                 ST.jump = trie->jump;
3250                 ST.me = scan;
3251                 ST.firstpos = NULL;
3252                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3253                 ST.nextword = 0;
3254
3255                 /* fully traverse the TRIE; note the position of the
3256                    shortest accept state and the wordnum of the longest
3257                    accept state */
3258
3259                 while ( state && uc <= (U8*)PL_regeol ) {
3260                     U32 base = trie->states[ state ].trans.base;
3261                     UV uvc = 0;
3262                     U16 charid = 0;
3263                     U16 wordnum;
3264                     wordnum = trie->states[ state ].wordnum;
3265
3266                     if (wordnum) { /* it's an accept state */
3267                         if (!accepted) {
3268                             accepted = 1;
3269                             /* record first match position */
3270                             if (ST.longfold) {
3271                                 ST.firstpos = (U8*)locinput;
3272                                 ST.firstchars = 0;
3273                             }
3274                             else {
3275                                 ST.firstpos = uc;
3276                                 ST.firstchars = charcount;
3277                             }
3278                         }
3279                         if (!ST.nextword || wordnum < ST.nextword)
3280                             ST.nextword = wordnum;
3281                         ST.topword = wordnum;
3282                     }
3283
3284                     DEBUG_TRIE_EXECUTE_r({
3285                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3286                                 PerlIO_printf( Perl_debug_log,
3287                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3288                                     2+depth * 2, "", PL_colors[4],
3289                                     (UV)state, (accepted ? 'Y' : 'N'));
3290                     });
3291
3292                     /* read a char and goto next state */
3293                     if ( base ) {
3294                         I32 offset;
3295                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3296                                              uscan, len, uvc, charid, foldlen,
3297                                              foldbuf, uniflags);
3298                         charcount++;
3299                         if (foldlen>0)
3300                             ST.longfold = TRUE;
3301                         if (charid &&
3302                              ( ((offset =
3303                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3304
3305                              && ((U32)offset < trie->lasttrans)
3306                              && trie->trans[offset].check == state)
3307                         {
3308                             state = trie->trans[offset].next;
3309                         }
3310                         else {
3311                             state = 0;
3312                         }
3313                         uc += len;
3314
3315                     }
3316                     else {
3317                         state = 0;
3318                     }
3319                     DEBUG_TRIE_EXECUTE_r(
3320                         PerlIO_printf( Perl_debug_log,
3321                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3322                             charid, uvc, (UV)state, PL_colors[5] );
3323                     );
3324                 }
3325                 if (!accepted)
3326                    sayNO;
3327
3328                 /* calculate total number of accept states */
3329                 {
3330                     U16 w = ST.topword;
3331                     accepted = 0;
3332                     while (w) {
3333                         w = trie->wordinfo[w].prev;
3334                         accepted++;
3335                     }
3336                     ST.accepted = accepted;
3337                 }
3338
3339                 DEBUG_EXECUTE_r(
3340                     PerlIO_printf( Perl_debug_log,
3341                         "%*s  %sgot %"IVdf" possible matches%s\n",
3342                         REPORT_CODE_OFF + depth * 2, "",
3343                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3344                 );
3345                 goto trie_first_try; /* jump into the fail handler */
3346             }}
3347             /* NOTREACHED */
3348
3349         case TRIE_next_fail: /* we failed - try next alternative */
3350             if ( ST.jump) {
3351                 REGCP_UNWIND(ST.cp);
3352                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3353                     PL_regoffs[n].end = -1;
3354                 *PL_reglastparen = n;
3355             }
3356             if (!--ST.accepted) {
3357                 DEBUG_EXECUTE_r({
3358                     PerlIO_printf( Perl_debug_log,
3359                         "%*s  %sTRIE failed...%s\n",
3360                         REPORT_CODE_OFF+depth*2, "",
3361                         PL_colors[4],
3362                         PL_colors[5] );
3363                 });
3364                 sayNO_SILENT;
3365             }
3366             {
3367                 /* Find next-highest word to process.  Note that this code
3368                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3369                 register U16 min = 0;
3370                 register U16 word;
3371                 register U16 const nextword = ST.nextword;
3372                 register reg_trie_wordinfo * const wordinfo
3373                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3374                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3375                     if (word > nextword && (!min || word < min))
3376                         min = word;
3377                 }
3378                 ST.nextword = min;
3379             }
3380
3381           trie_first_try:
3382             if (do_cutgroup) {
3383                 do_cutgroup = 0;
3384                 no_final = 0;
3385             }
3386
3387             if ( ST.jump) {
3388                 ST.lastparen = *PL_reglastparen;
3389                 REGCP_SET(ST.cp);
3390             }
3391
3392             /* find start char of end of current word */
3393             {
3394                 U32 chars; /* how many chars to skip */
3395                 U8 *uc = ST.firstpos;
3396                 reg_trie_data * const trie
3397                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3398
3399                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3400                             >=  ST.firstchars);
3401                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3402                             - ST.firstchars;
3403
3404                 if (ST.longfold) {
3405                     /* the hard option - fold each char in turn and find
3406                      * its folded length (which may be different */
3407                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3408                     STRLEN foldlen;
3409                     STRLEN len;
3410                     UV uvc;
3411                     U8 *uscan;
3412
3413                     while (chars) {
3414                         if (utf8_target) {
3415                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3416                                                     uniflags);
3417                             uc += len;
3418                         }
3419                         else {
3420                             uvc = *uc;
3421                             uc++;
3422                         }
3423                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3424                         uscan = foldbuf;
3425                         while (foldlen) {
3426                             if (!--chars)
3427                                 break;
3428                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3429                                             uniflags);
3430                             uscan += len;
3431                             foldlen -= len;
3432                         }
3433                     }
3434                 }
3435                 else {
3436                     if (utf8_target)
3437                         while (chars--)
3438                             uc += UTF8SKIP(uc);
3439                     else
3440                         uc += chars;
3441                 }
3442                 PL_reginput = (char *)uc;
3443             }
3444
3445             scan = (ST.jump && ST.jump[ST.nextword])
3446                         ? ST.me + ST.jump[ST.nextword]
3447                         : ST.B;
3448
3449             DEBUG_EXECUTE_r({
3450                 PerlIO_printf( Perl_debug_log,
3451                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3452                     REPORT_CODE_OFF+depth*2, "",
3453                     PL_colors[4],
3454                     ST.nextword,
3455                     PL_colors[5]
3456                     );
3457             });
3458
3459             if (ST.accepted > 1 || has_cutgroup) {
3460                 PUSH_STATE_GOTO(TRIE_next, scan);
3461                 /* NOTREACHED */
3462             }
3463             /* only one choice left - just continue */
3464             DEBUG_EXECUTE_r({
3465                 AV *const trie_words
3466                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3467                 SV ** const tmp = av_fetch( trie_words,
3468                     ST.nextword-1, 0 );
3469                 SV *sv= tmp ? sv_newmortal() : NULL;
3470
3471                 PerlIO_printf( Perl_debug_log,
3472                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3473                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3474                     ST.nextword,
3475                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3476                             PL_colors[0], PL_colors[1],
3477                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3478                         )
3479                     : "not compiled under -Dr",
3480                     PL_colors[5] );
3481             });
3482
3483             locinput = PL_reginput;
3484             nextchr = UCHARAT(locinput);
3485             continue; /* execute rest of RE */
3486             /* NOTREACHED */
3487 #undef  ST
3488
3489         case EXACT: {
3490             char *s = STRING(scan);
3491             ln = STR_LEN(scan);
3492             if (utf8_target != UTF_PATTERN) {
3493                 /* The target and the pattern have differing utf8ness. */
3494                 char *l = locinput;
3495                 const char * const e = s + ln;
3496
3497                 if (utf8_target) {
3498                     /* The target is utf8, the pattern is not utf8. */
3499                     while (s < e) {
3500                         STRLEN ulen;
3501                         if (l >= PL_regeol)
3502                              sayNO;
3503                         if (NATIVE_TO_UNI(*(U8*)s) !=
3504                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3505                                             uniflags))
3506                              sayNO;
3507                         l += ulen;
3508                         s ++;
3509                     }
3510                 }
3511                 else {
3512                     /* The target is not utf8, the pattern is utf8. */
3513                     while (s < e) {
3514                         STRLEN ulen;
3515                         if (l >= PL_regeol)
3516                             sayNO;
3517                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3518                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3519                                            uniflags))
3520                             sayNO;
3521                         s += ulen;
3522                         l ++;
3523                     }
3524                 }
3525                 locinput = l;
3526                 nextchr = UCHARAT(locinput);
3527                 break;
3528             }
3529             /* The target and the pattern have the same utf8ness. */
3530             /* Inline the first character, for speed. */
3531             if (UCHARAT(s) != nextchr)
3532                 sayNO;
3533             if (PL_regeol - locinput < ln)
3534                 sayNO;
3535             if (ln > 1 && memNE(s, locinput, ln))
3536                 sayNO;
3537             locinput += ln;
3538             nextchr = UCHARAT(locinput);
3539             break;
3540             }
3541         case EXACTFL:
3542             PL_reg_flags |= RF_tainted;
3543             /* FALL THROUGH */
3544         case EXACTF: {
3545             char * const s = STRING(scan);
3546             ln = STR_LEN(scan);
3547
3548             if (utf8_target || UTF_PATTERN) {
3549               /* Either target or the pattern are utf8. */
3550                 const char * const l = locinput;
3551                 char *e = PL_regeol;
3552
3553                 if (! foldEQ_utf8(s, 0,  ln, cBOOL(UTF_PATTERN),
3554                                l, &e, 0,  utf8_target)) {
3555                      /* One more case for the sharp s:
3556                       * pack("U0U*", 0xDF) =~ /ss/i,
3557                       * the 0xC3 0x9F are the UTF-8
3558                       * byte sequence for the U+00DF. */
3559
3560                      if (!(utf8_target &&
3561                            toLOWER(s[0]) == 's' &&
3562                            ln >= 2 &&
3563                            toLOWER(s[1]) == 's' &&
3564                            (U8)l[0] == 0xC3 &&
3565                            e - l >= 2 &&
3566                            (U8)l[1] == 0x9F))
3567                           sayNO;
3568                 }
3569                 locinput = e;
3570                 nextchr = UCHARAT(locinput);
3571                 break;
3572             }
3573
3574             /* Neither the target and the pattern are utf8. */
3575
3576             /* Inline the first character, for speed. */
3577             if (UCHARAT(s) != nextchr &&
3578                 UCHARAT(s) != ((OP(scan) == EXACTF)
3579                                ? PL_fold : PL_fold_locale)[nextchr])
3580                 sayNO;
3581             if (PL_regeol - locinput < ln)
3582                 sayNO;
3583             if (ln > 1 && (OP(scan) == EXACTF
3584                            ? ! foldEQ(s, locinput, ln)
3585                            : ! foldEQ_locale(s, locinput, ln)))
3586                 sayNO;
3587             locinput += ln;
3588             nextchr = UCHARAT(locinput);
3589             break;
3590             }
3591         case BOUNDL:
3592         case NBOUNDL:
3593             PL_reg_flags |= RF_tainted;
3594             /* FALL THROUGH */
3595         case BOUND:
3596         case NBOUND:
3597             /* was last char in word? */
3598             if (utf8_target) {
3599                 if (locinput == PL_bostr)
3600                     ln = '\n';
3601                 else {
3602                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3603
3604                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3605                 }
3606                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3607                     ln = isALNUM_uni(ln);
3608                     LOAD_UTF8_CHARCLASS_ALNUM();
3609                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3610                 }
3611                 else {
3612                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3613                     n = isALNUM_LC_utf8((U8*)locinput);
3614                 }
3615             }
3616             else {
3617                 ln = (locinput != PL_bostr) ?
3618                     UCHARAT(locinput - 1) : '\n';
3619                 if (FLAGS(scan) & USE_UNI) {
3620
3621                     /* Here, can't be BOUNDL or NBOUNDL because they never set
3622                      * the flags to USE_UNI */
3623                     ln = isWORDCHAR_L1(ln);
3624                     n = isWORDCHAR_L1(nextchr);
3625                 }
3626                 else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3627                     ln = isALNUM(ln);
3628                     n = isALNUM(nextchr);
3629                 }
3630                 else {
3631                     ln = isALNUM_LC(ln);
3632                     n = isALNUM_LC(nextchr);
3633                 }
3634             }
3635             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3636                                     OP(scan) == BOUNDL))
3637                     sayNO;
3638             break;
3639         case ANYOF:
3640             if (utf8_target) {
3641                 STRLEN inclasslen = PL_regeol - locinput;
3642                 if (locinput >= PL_regeol)
3643                     sayNO;
3644
3645                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3646                     goto anyof_fail;
3647                 locinput += inclasslen;
3648                 nextchr = UCHARAT(locinput);
3649                 break;
3650             }
3651             else {
3652                 if (nextchr < 0)
3653                     nextchr = UCHARAT(locinput);
3654                 if (!nextchr && locinput >= PL_regeol)
3655                     sayNO;
3656                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3657                     goto anyof_fail;
3658                 nextchr = UCHARAT(++locinput);
3659                 break;
3660             }
3661         anyof_fail:
3662             /* If we might have the case of the German sharp s
3663              * in a casefolding Unicode character class. */
3664
3665             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3666                  locinput += SHARP_S_SKIP;
3667                  nextchr = UCHARAT(locinput);
3668             }
3669             else
3670                  sayNO;
3671             break;
3672         /* Special char classes - The defines start on line 129 or so */
3673         CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3674         CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3675
3676         CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3677         CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3678
3679         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3680         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3681
3682         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3683                        a Unicode extended Grapheme Cluster */
3684             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3685               extended Grapheme Cluster is:
3686
3687                CR LF
3688                | Prepend* Begin Extend*
3689                | .
3690
3691                Begin is (Hangul-syllable | ! Control)
3692                Extend is (Grapheme_Extend | Spacing_Mark)
3693                Control is [ GCB_Control CR LF ]
3694
3695                The discussion below shows how the code for CLUMP is derived
3696                from this regex.  Note that most of these concepts are from
3697                property values of the Grapheme Cluster Boundary (GCB) property.
3698                No code point can have multiple property values for a given
3699                property.  Thus a code point in Prepend can't be in Control, but
3700                it must be in !Control.  This is why Control above includes
3701                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3702                property separately, and so can't be in GCB_Control, even though
3703                they logically are controls.  Control is not the same as gc=cc,
3704                but includes format and other characters as well.
3705
3706                The Unicode definition of Hangul-syllable is:
3707                    L+
3708                    | (L* ( ( V | LV ) V* | LVT ) T*)
3709                    | T+
3710                   )
3711                Each of these is a value for the GCB property, and hence must be
3712                disjoint, so the order they are tested is immaterial, so the
3713                above can safely be changed to
3714                    T+
3715                    | L+
3716                    | (L* ( LVT | ( V | LV ) V*) T*)
3717
3718                The last two terms can be combined like this:
3719                    L* ( L
3720                         | (( LVT | ( V | LV ) V*) T*))
3721
3722                And refactored into this:
3723                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3724
3725                That means that if we have seen any L's at all we can quit
3726                there, but if the next character is a LVT, a V or and LV we
3727                should keep going.
3728
3729                There is a subtlety with Prepend* which showed up in testing.
3730                Note that the Begin, and only the Begin is required in:
3731                 | Prepend* Begin Extend*
3732                Also, Begin contains '! Control'.  A Prepend must be a '!
3733                Control', which means it must be a Begin.  What it comes down to
3734                is that if we match Prepend* and then find no suitable Begin
3735                afterwards, that if we backtrack the last Prepend, that one will
3736                be a suitable Begin.
3737             */
3738
3739             if (locinput >= PL_regeol)
3740                 sayNO;
3741             if  (! utf8_target) {
3742
3743                 /* Match either CR LF  or '.', as all the other possibilities
3744                  * require utf8 */
3745                 locinput++;         /* Match the . or CR */
3746                 if (nextchr == '\r'
3747                     && locinput < PL_regeol
3748                     && UCHARAT(locinput) == '\n') locinput++;
3749             }
3750             else {
3751
3752                 /* Utf8: See if is ( CR LF ); already know that locinput <
3753                  * PL_regeol, so locinput+1 is in bounds */
3754                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3755                     locinput += 2;
3756                 }
3757                 else {
3758                     /* In case have to backtrack to beginning, then match '.' */
3759                     char *starting = locinput;
3760
3761                     /* In case have to backtrack the last prepend */
3762                     char *previous_prepend = 0;
3763
3764                     LOAD_UTF8_CHARCLASS_GCB();
3765
3766                     /* Match (prepend)* */
3767                     while (locinput < PL_regeol
3768                            && swash_fetch(PL_utf8_X_prepend,
3769                                           (U8*)locinput, utf8_target))
3770                     {
3771                         previous_prepend = locinput;
3772                         locinput += UTF8SKIP(locinput);
3773                     }
3774
3775                     /* As noted above, if we matched a prepend character, but
3776                      * the next thing won't match, back off the last prepend we
3777                      * matched, as it is guaranteed to match the begin */
3778                     if (previous_prepend
3779                         && (locinput >=  PL_regeol
3780                             || ! swash_fetch(PL_utf8_X_begin,
3781                                              (U8*)locinput, utf8_target)))
3782                     {
3783                         locinput = previous_prepend;
3784                     }
3785
3786                     /* Note that here we know PL_regeol > locinput, as we
3787                      * tested that upon input to this switch case, and if we
3788                      * moved locinput forward, we tested the result just above
3789                      * and it either passed, or we backed off so that it will
3790                      * now pass */
3791                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3792
3793                         /* Here did not match the required 'Begin' in the
3794                          * second term.  So just match the very first
3795                          * character, the '.' of the final term of the regex */
3796                         locinput = starting + UTF8SKIP(starting);
3797                     } else {
3798
3799                         /* Here is the beginning of a character that can have
3800                          * an extender.  It is either a hangul syllable, or a
3801                          * non-control */
3802                         if (swash_fetch(PL_utf8_X_non_hangul,
3803                                         (U8*)locinput, utf8_target))
3804                         {
3805
3806                             /* Here not a Hangul syllable, must be a
3807                              * ('!  * Control') */
3808                             locinput += UTF8SKIP(locinput);
3809                         } else {
3810
3811                             /* Here is a Hangul syllable.  It can be composed
3812                              * of several individual characters.  One
3813                              * possibility is T+ */
3814                             if (swash_fetch(PL_utf8_X_T,
3815                                             (U8*)locinput, utf8_target))
3816                             {
3817                                 while (locinput < PL_regeol
3818                                         && swash_fetch(PL_utf8_X_T,
3819                                                         (U8*)locinput, utf8_target))
3820                                 {
3821                                     locinput += UTF8SKIP(locinput);
3822                                 }
3823                             } else {
3824
3825                                 /* Here, not T+, but is a Hangul.  That means
3826                                  * it is one of the others: L, LV, LVT or V,
3827                                  * and matches:
3828                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3829
3830                                 /* Match L*           */
3831                                 while (locinput < PL_regeol
3832                                         && swash_fetch(PL_utf8_X_L,
3833                                                         (U8*)locinput, utf8_target))
3834                                 {
3835                                     locinput += UTF8SKIP(locinput);
3836                                 }
3837
3838                                 /* Here, have exhausted L*.  If the next
3839                                  * character is not an LV, LVT nor V, it means
3840                                  * we had to have at least one L, so matches L+
3841                                  * in the original equation, we have a complete
3842                                  * hangul syllable.  Are done. */
3843
3844                                 if (locinput < PL_regeol
3845                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3846                                                     (U8*)locinput, utf8_target))
3847                                 {
3848
3849                                     /* Otherwise keep going.  Must be LV, LVT
3850                                      * or V.  See if LVT */
3851                                     if (swash_fetch(PL_utf8_X_LVT,
3852                                                     (U8*)locinput, utf8_target))
3853                                     {
3854                                         locinput += UTF8SKIP(locinput);
3855                                     } else {
3856
3857                                         /* Must be  V or LV.  Take it, then
3858                                          * match V*     */
3859                                         locinput += UTF8SKIP(locinput);
3860                                         while (locinput < PL_regeol
3861                                                 && swash_fetch(PL_utf8_X_V,
3862                                                          (U8*)locinput, utf8_target))
3863                                         {
3864                                             locinput += UTF8SKIP(locinput);
3865                                         }
3866                                     }
3867
3868                                     /* And any of LV, LVT, or V can be followed
3869                                      * by T*            */
3870                                     while (locinput < PL_regeol
3871                                            && swash_fetch(PL_utf8_X_T,
3872                                                            (U8*)locinput,
3873                                                            utf8_target))
3874                                     {
3875                                         locinput += UTF8SKIP(locinput);
3876                                     }
3877                                 }
3878                             }
3879                         }
3880
3881                         /* Match any extender */
3882                         while (locinput < PL_regeol
3883                                 && swash_fetch(PL_utf8_X_extend,
3884                                                 (U8*)locinput, utf8_target))
3885                         {
3886                             locinput += UTF8SKIP(locinput);
3887                         }
3888                     }
3889                 }
3890                 if (locinput > PL_regeol) sayNO;
3891             }
3892             nextchr = UCHARAT(locinput);
3893             break;
3894
3895         case NREFFL:
3896         {
3897             char *s;
3898             char type;
3899             PL_reg_flags |= RF_tainted;
3900             /* FALL THROUGH */
3901         case NREF:
3902         case NREFF:
3903             type = OP(scan);
3904             n = reg_check_named_buff_matched(rex,scan);
3905
3906             if ( n ) {
3907                 type = REF + ( type - NREF );
3908                 goto do_ref;
3909             } else {
3910                 sayNO;
3911             }
3912             /* unreached */
3913         case REFFL:
3914             PL_reg_flags |= RF_tainted;
3915             /* FALL THROUGH */
3916         case REF:
3917         case REFF:
3918             n = ARG(scan);  /* which paren pair */
3919             type = OP(scan);
3920           do_ref:
3921             ln = PL_regoffs[n].start;
3922             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
3923             if (*PL_reglastparen < n || ln == -1)
3924                 sayNO;                  /* Do not match unless seen CLOSEn. */
3925             if (ln == PL_regoffs[n].end)
3926                 break;
3927
3928             s = PL_bostr + ln;
3929             if (utf8_target && type != REF) {   /* REF can do byte comparison */
3930                 char *l = locinput;
3931                 const char *e = PL_bostr + PL_regoffs[n].end;
3932                 /*
3933                  * Note that we can't do the "other character" lookup trick as
3934                  * in the 8-bit case (no pun intended) because in Unicode we
3935                  * have to map both upper and title case to lower case.
3936                  */
3937                 if (type == REFF) {
3938                     while (s < e) {
3939                         STRLEN ulen1, ulen2;
3940                         U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3941                         U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3942
3943                         if (l >= PL_regeol)
3944                             sayNO;
3945                         toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3946                         toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3947                         if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3948                             sayNO;
3949                         s += ulen1;
3950                         l += ulen2;
3951                     }
3952                 }
3953                 locinput = l;
3954                 nextchr = UCHARAT(locinput);
3955                 break;
3956             }
3957
3958             /* Inline the first character, for speed. */
3959             if (UCHARAT(s) != nextchr &&
3960                 (type == REF ||
3961                  (UCHARAT(s) != (type == REFF
3962                                   ? PL_fold : PL_fold_locale)[nextchr])))
3963                 sayNO;
3964             ln = PL_regoffs[n].end - ln;
3965             if (locinput + ln > PL_regeol)
3966                 sayNO;
3967             if (ln > 1 && (type == REF
3968                            ? memNE(s, locinput, ln)
3969                            : (type == REFF
3970                               ? ! foldEQ(s, locinput, ln)
3971                               : ! foldEQ_locale(s, locinput, ln))))
3972                 sayNO;
3973             locinput += ln;
3974             nextchr = UCHARAT(locinput);
3975             break;
3976         }
3977         case NOTHING:
3978         case TAIL:
3979             break;
3980         case BACK:
3981             break;
3982
3983 #undef  ST
3984 #define ST st->u.eval
3985         {
3986             SV *ret;
3987             REGEXP *re_sv;
3988             regexp *re;
3989             regexp_internal *rei;
3990             regnode *startpoint;
3991
3992         case GOSTART:
3993         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3994             if (cur_eval && cur_eval->locinput==locinput) {
3995                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3996                     Perl_croak(aTHX_ "Infinite recursion in regex");
3997                 if ( ++nochange_depth > max_nochange_depth )
3998                     Perl_croak(aTHX_
3999                         "Pattern subroutine nesting without pos change"
4000                         " exceeded limit in regex");
4001             } else {
4002                 nochange_depth = 0;
4003             }
4004             re_sv = rex_sv;
4005             re = rex;
4006             rei = rexi;
4007             (void)ReREFCNT_inc(rex_sv);
4008             if (OP(scan)==GOSUB) {
4009                 startpoint = scan + ARG2L(scan);
4010                 ST.close_paren = ARG(scan);
4011             } else {
4012                 startpoint = rei->program+1;
4013                 ST.close_paren = 0;
4014             }
4015             goto eval_recurse_doit;
4016             /* NOTREACHED */
4017         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4018             if (cur_eval && cur_eval->locinput==locinput) {
4019                 if ( ++nochange_depth > max_nochange_depth )
4020                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4021             } else {
4022                 nochange_depth = 0;
4023             }
4024             {
4025                 /* execute the code in the {...} */
4026                 dSP;
4027                 SV ** const before = SP;
4028                 OP_4tree * const oop = PL_op;
4029                 COP * const ocurcop = PL_curcop;
4030                 PAD *old_comppad;
4031                 char *saved_regeol = PL_regeol;
4032                 struct re_save_state saved_state;
4033
4034                 /* To not corrupt the existing regex state while executing the
4035                  * eval we would normally put it on the save stack, like with
4036                  * save_re_context. However, re-evals have a weird scoping so we
4037                  * can't just add ENTER/LEAVE here. With that, things like
4038                  *
4039                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4040                  *
4041                  * would break, as they expect the localisation to be unwound
4042                  * only when the re-engine backtracks through the bit that
4043                  * localised it.
4044                  *
4045                  * What we do instead is just saving the state in a local c
4046                  * variable.
4047                  */
4048                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4049
4050                 n = ARG(scan);
4051                 PL_op = (OP_4tree*)rexi->data->data[n];
4052                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4053                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4054                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4055                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4056
4057                 if (sv_yes_mark) {
4058                     SV *sv_mrk = get_sv("REGMARK", 1);
4059                     sv_setsv(sv_mrk, sv_yes_mark);
4060                 }
4061
4062                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4063                 SPAGAIN;
4064                 if (SP == before)
4065                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4066                 else {
4067                     ret = POPs;
4068                     PUTBACK;
4069                 }
4070
4071                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4072
4073                 PL_op = oop;
4074                 PAD_RESTORE_LOCAL(old_comppad);
4075                 PL_curcop = ocurcop;
4076                 PL_regeol = saved_regeol;
4077                 if (!logical) {
4078                     /* /(?{...})/ */
4079                     sv_setsv(save_scalar(PL_replgv), ret);
4080                     break;
4081                 }
4082             }
4083             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4084                 logical = 0;
4085                 {
4086                     /* extract RE object from returned value; compiling if
4087                      * necessary */
4088                     MAGIC *mg = NULL;
4089                     REGEXP *rx = NULL;
4090
4091                     if (SvROK(ret)) {
4092                         SV *const sv = SvRV(ret);
4093
4094                         if (SvTYPE(sv) == SVt_REGEXP) {
4095                             rx = (REGEXP*) sv;
4096                         } else if (SvSMAGICAL(sv)) {
4097                             mg = mg_find(sv, PERL_MAGIC_qr);
4098                             assert(mg);
4099                         }
4100                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4101                         rx = (REGEXP*) ret;
4102                     } else if (SvSMAGICAL(ret)) {
4103                         if (SvGMAGICAL(ret)) {
4104                             /* I don't believe that there is ever qr magic
4105                                here.  */
4106                             assert(!mg_find(ret, PERL_MAGIC_qr));
4107                             sv_unmagic(ret, PERL_MAGIC_qr);
4108                         }
4109                         else {
4110                             mg = mg_find(ret, PERL_MAGIC_qr);
4111                             /* testing suggests mg only ends up non-NULL for
4112                                scalars who were upgraded and compiled in the
4113                                else block below. In turn, this is only
4114                                triggered in the "postponed utf8 string" tests
4115                                in t/op/pat.t  */
4116                         }
4117                     }
4118
4119                     if (mg) {
4120                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4121                         assert(rx);
4122                     }
4123                     if (rx) {
4124                         rx = reg_temp_copy(NULL, rx);
4125                     }
4126                     else {
4127                         U32 pm_flags = 0;
4128                         const I32 osize = PL_regsize;
4129
4130                         if (DO_UTF8(ret)) {
4131                             assert (SvUTF8(ret));
4132                         } else if (SvUTF8(ret)) {
4133                             /* Not doing UTF-8, despite what the SV says. Is
4134                                this only if we're trapped in use 'bytes'?  */
4135                             /* Make a copy of the octet sequence, but without
4136                                the flag on, as the compiler now honours the
4137                                SvUTF8 flag on ret.  */
4138                             STRLEN len;
4139                             const char *const p = SvPV(ret, len);
4140                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4141                         }
4142                         rx = CALLREGCOMP(ret, pm_flags);
4143                         if (!(SvFLAGS(ret)
4144                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4145                                  | SVs_GMG))) {
4146                             /* This isn't a first class regexp. Instead, it's
4147                                caching a regexp onto an existing, Perl visible
4148                                scalar.  */
4149                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4150                         }
4151                         PL_regsize = osize;
4152                     }
4153                     re_sv = rx;
4154                     re = (struct regexp *)SvANY(rx);
4155                 }
4156                 RXp_MATCH_COPIED_off(re);
4157                 re->subbeg = rex->subbeg;
4158                 re->sublen = rex->sublen;
4159                 rei = RXi_GET(re);
4160                 DEBUG_EXECUTE_r(
4161                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4162                         "Matching embedded");
4163                 );
4164                 startpoint = rei->program + 1;
4165                 ST.close_paren = 0; /* only used for GOSUB */
4166                 /* borrowed from regtry */
4167                 if (PL_reg_start_tmpl <= re->nparens) {
4168                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4169                     if(PL_reg_start_tmp)
4170                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4171                     else
4172                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4173                 }
4174
4175         eval_recurse_doit: /* Share code with GOSUB below this line */
4176                 /* run the pattern returned from (??{...}) */
4177                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4178                 REGCP_SET(ST.lastcp);
4179
4180                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4181
4182                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4183                 PL_reglastparen = &re->lastparen;
4184                 PL_reglastcloseparen = &re->lastcloseparen;
4185                 re->lastparen = 0;
4186                 re->lastcloseparen = 0;
4187
4188                 PL_reginput = locinput;
4189                 PL_regsize = 0;
4190
4191                 /* XXXX This is too dramatic a measure... */
4192                 PL_reg_maxiter = 0;
4193
4194                 ST.toggle_reg_flags = PL_reg_flags;
4195                 if (RX_UTF8(re_sv))
4196                     PL_reg_flags |= RF_utf8;
4197                 else
4198                     PL_reg_flags &= ~RF_utf8;
4199                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4200
4201                 ST.prev_rex = rex_sv;
4202                 ST.prev_curlyx = cur_curlyx;
4203                 SETREX(rex_sv,re_sv);
4204                 rex = re;
4205                 rexi = rei;
4206                 cur_curlyx = NULL;
4207                 ST.B = next;
4208                 ST.prev_eval = cur_eval;
4209                 cur_eval = st;
4210                 /* now continue from first node in postoned RE */
4211                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4212                 /* NOTREACHED */
4213             }
4214             /* logical is 1,   /(?(?{...})X|Y)/ */
4215             sw = cBOOL(SvTRUE(ret));
4216             logical = 0;
4217             break;
4218         }
4219
4220         case EVAL_AB: /* cleanup after a successful (??{A})B */
4221             /* note: this is called twice; first after popping B, then A */
4222             PL_reg_flags ^= ST.toggle_reg_flags;
4223             ReREFCNT_dec(rex_sv);
4224             SETREX(rex_sv,ST.prev_rex);
4225             rex = (struct regexp *)SvANY(rex_sv);
4226             rexi = RXi_GET(rex);
4227             regcpblow(ST.cp);
4228             cur_eval = ST.prev_eval;
4229             cur_curlyx = ST.prev_curlyx;
4230
4231             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4232             PL_reglastparen = &rex->lastparen;
4233             PL_reglastcloseparen = &rex->lastcloseparen;
4234             /* also update PL_regoffs */
4235             PL_regoffs = rex->offs;
4236
4237             /* XXXX This is too dramatic a measure... */
4238             PL_reg_maxiter = 0;
4239             if ( nochange_depth )
4240                 nochange_depth--;
4241             sayYES;
4242
4243
4244         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4245             /* note: this is called twice; first after popping B, then A */
4246             PL_reg_flags ^= ST.toggle_reg_flags;
4247             ReREFCNT_dec(rex_sv);
4248             SETREX(rex_sv,ST.prev_rex);
4249             rex = (struct regexp *)SvANY(rex_sv);
4250             rexi = RXi_GET(rex);
4251             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4252             PL_reglastparen = &rex->lastparen;
4253             PL_reglastcloseparen = &rex->lastcloseparen;
4254
4255             PL_reginput = locinput;
4256             REGCP_UNWIND(ST.lastcp);
4257             regcppop(rex);
4258             cur_eval = ST.prev_eval;
4259             cur_curlyx = ST.prev_curlyx;
4260             /* XXXX This is too dramatic a measure... */
4261             PL_reg_maxiter = 0;
4262             if ( nochange_depth )
4263                 nochange_depth--;
4264             sayNO_SILENT;
4265 #undef ST
4266
4267         case OPEN:
4268             n = ARG(scan);  /* which paren pair */
4269             PL_reg_start_tmp[n] = locinput;
4270             if (n > PL_regsize)
4271                 PL_regsize = n;
4272             lastopen = n;
4273             break;
4274         case CLOSE:
4275             n = ARG(scan);  /* which paren pair */
4276             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4277             PL_regoffs[n].end = locinput - PL_bostr;
4278             /*if (n > PL_regsize)
4279                 PL_regsize = n;*/
4280             if (n > *PL_reglastparen)
4281                 *PL_reglastparen = n;
4282             *PL_reglastcloseparen = n;
4283             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4284                 goto fake_end;
4285             }
4286             break;
4287         case ACCEPT:
4288             if (ARG(scan)){
4289                 regnode *cursor;
4290                 for (cursor=scan;
4291                      cursor && OP(cursor)!=END;
4292                      cursor=regnext(cursor))
4293                 {
4294                     if ( OP(cursor)==CLOSE ){
4295                         n = ARG(cursor);
4296                         if ( n <= lastopen ) {
4297                             PL_regoffs[n].start
4298                                 = PL_reg_start_tmp[n] - PL_bostr;
4299                             PL_regoffs[n].end = locinput - PL_bostr;
4300                             /*if (n > PL_regsize)
4301                             PL_regsize = n;*/
4302                             if (n > *PL_reglastparen)
4303                                 *PL_reglastparen = n;
4304                             *PL_reglastcloseparen = n;
4305                             if ( n == ARG(scan) || (cur_eval &&
4306                                 cur_eval->u.eval.close_paren == n))
4307                                 break;
4308                         }
4309                     }
4310                 }
4311             }
4312             goto fake_end;
4313             /*NOTREACHED*/
4314         case GROUPP:
4315             n = ARG(scan);  /* which paren pair */
4316             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4317             break;
4318         case NGROUPP:
4319             /* reg_check_named_buff_matched returns 0 for no match */
4320             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4321             break;
4322         case INSUBP:
4323             n = ARG(scan);
4324             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4325             break;
4326         case DEFINEP:
4327             sw = 0;
4328             break;
4329         case IFTHEN:
4330             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4331             if (sw)
4332                 next = NEXTOPER(NEXTOPER(scan));
4333             else {
4334                 next = scan + ARG(scan);
4335                 if (OP(next) == IFTHEN) /* Fake one. */
4336                     next = NEXTOPER(NEXTOPER(next));
4337             }
4338             break;
4339         case LOGICAL:
4340             logical = scan->flags;
4341             break;
4342
4343 /*******************************************************************
4344
4345 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4346 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4347 STAR/PLUS/CURLY/CURLYN are used instead.)
4348
4349 A*B is compiled as <CURLYX><A><WHILEM><B>
4350
4351 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4352 state, which contains the current count, initialised to -1. It also sets
4353 cur_curlyx to point to this state, with any previous value saved in the
4354 state block.
4355
4356 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4357 since the pattern may possibly match zero times (i.e. it's a while {} loop
4358 rather than a do {} while loop).
4359
4360 Each entry to WHILEM represents a successful match of A. The count in the
4361 CURLYX block is incremented, another WHILEM state is pushed, and execution
4362 passes to A or B depending on greediness and the current count.
4363
4364 For example, if matching against the string a1a2a3b (where the aN are
4365 substrings that match /A/), then the match progresses as follows: (the
4366 pushed states are interspersed with the bits of strings matched so far):
4367
4368     <CURLYX cnt=-1>
4369     <CURLYX cnt=0><WHILEM>
4370     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4371     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4372     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4373     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4374
4375 (Contrast this with something like CURLYM, which maintains only a single
4376 backtrack state:
4377
4378     <CURLYM cnt=0> a1
4379     a1 <CURLYM cnt=1> a2
4380     a1 a2 <CURLYM cnt=2> a3
4381     a1 a2 a3 <CURLYM cnt=3> b
4382 )
4383
4384 Each WHILEM state block marks a point to backtrack to upon partial failure
4385 of A or B, and also contains some minor state data related to that
4386 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4387 overall state, such as the count, and pointers to the A and B ops.
4388
4389 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4390 must always point to the *current* CURLYX block, the rules are:
4391
4392 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4393 and set cur_curlyx to point the new block.
4394
4395 When popping the CURLYX block after a successful or unsuccessful match,
4396 restore the previous cur_curlyx.
4397
4398 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4399 to the outer one saved in the CURLYX block.
4400
4401 When popping the WHILEM block after a successful or unsuccessful B match,
4402 restore the previous cur_curlyx.
4403
4404 Here's an example for the pattern (AI* BI)*BO
4405 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4406
4407 cur_
4408 curlyx backtrack stack
4409 ------ ---------------
4410 NULL
4411 CO     <CO prev=NULL> <WO>
4412 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4413 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4414 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4415
4416 At this point the pattern succeeds, and we work back down the stack to
4417 clean up, restoring as we go:
4418
4419 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4420 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4421 CO     <CO prev=NULL> <WO>
4422 NULL
4423
4424 *******************************************************************/
4425
4426 #define ST st->u.curlyx
4427
4428         case CURLYX:    /* start of /A*B/  (for complex A) */
4429         {
4430             /* No need to save/restore up to this paren */
4431             I32 parenfloor = scan->flags;
4432
4433             assert(next); /* keep Coverity happy */
4434             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4435                 next += ARG(next);
4436
4437             /* XXXX Probably it is better to teach regpush to support
4438                parenfloor > PL_regsize... */
4439             if (parenfloor > (I32)*PL_reglastparen)
4440                 parenfloor = *PL_reglastparen; /* Pessimization... */
4441
4442             ST.prev_curlyx= cur_curlyx;
4443             cur_curlyx = st;
4444             ST.cp = PL_savestack_ix;
4445
4446             /* these fields contain the state of the current curly.
4447              * they are accessed by subsequent WHILEMs */
4448             ST.parenfloor = parenfloor;
4449             ST.me = scan;
4450             ST.B = next;
4451             ST.minmod = minmod;
4452             minmod = 0;
4453             ST.count = -1;      /* this will be updated by WHILEM */
4454             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4455
4456             PL_reginput = locinput;
4457             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4458             /* NOTREACHED */
4459         }
4460
4461         case CURLYX_end: /* just finished matching all of A*B */
4462             cur_curlyx = ST.prev_curlyx;
4463             sayYES;
4464             /* NOTREACHED */
4465
4466         case CURLYX_end_fail: /* just failed to match all of A*B */
4467             regcpblow(ST.cp);
4468             cur_curlyx = ST.prev_curlyx;
4469             sayNO;
4470             /* NOTREACHED */
4471
4472
4473 #undef ST
4474 #define ST st->u.whilem
4475
4476         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4477         {
4478             /* see the discussion above about CURLYX/WHILEM */
4479             I32 n;
4480             int min = ARG1(cur_curlyx->u.curlyx.me);
4481             int max = ARG2(cur_curlyx->u.curlyx.me);
4482             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4483
4484             assert(cur_curlyx); /* keep Coverity happy */
4485             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4486             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4487             ST.cache_offset = 0;
4488             ST.cache_mask = 0;
4489
4490             PL_reginput = locinput;
4491
4492             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4493                   "%*s  whilem: matched %ld out of %d..%d\n",
4494                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4495             );
4496
4497             /* First just match a string of min A's. */
4498
4499             if (n < min) {
4500                 cur_curlyx->u.curlyx.lastloc = locinput;
4501                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4502                 /* NOTREACHED */
4503             }
4504
4505             /* If degenerate A matches "", assume A done. */
4506
4507             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4508                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4509                    "%*s  whilem: empty match detected, trying continuation...\n",
4510                    REPORT_CODE_OFF+depth*2, "")
4511                 );
4512                 goto do_whilem_B_max;
4513             }
4514
4515             /* super-linear cache processing */
4516
4517             if (scan->flags) {
4518
4519                 if (!PL_reg_maxiter) {
4520                     /* start the countdown: Postpone detection until we
4521                      * know the match is not *that* much linear. */
4522                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4523                     /* possible overflow for long strings and many CURLYX's */
4524                     if (PL_reg_maxiter < 0)
4525                         PL_reg_maxiter = I32_MAX;
4526                     PL_reg_leftiter = PL_reg_maxiter;
4527                 }
4528
4529                 if (PL_reg_leftiter-- == 0) {
4530                     /* initialise cache */
4531                     const I32 size = (PL_reg_maxiter + 7)/8;
4532                     if (PL_reg_poscache) {
4533                         if ((I32)PL_reg_poscache_size < size) {
4534                             Renew(PL_reg_poscache, size, char);
4535                             PL_reg_poscache_size = size;
4536                         }
4537                         Zero(PL_reg_poscache, size, char);
4538                     }
4539                     else {
4540                         PL_reg_poscache_size = size;
4541                         Newxz(PL_reg_poscache, size, char);
4542                     }
4543                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4544       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4545                               PL_colors[4], PL_colors[5])
4546                     );
4547                 }
4548
4549                 if (PL_reg_leftiter < 0) {
4550                     /* have we already failed at this position? */
4551                     I32 offset, mask;
4552                     offset  = (scan->flags & 0xf) - 1
4553                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4554                     mask    = 1 << (offset % 8);
4555                     offset /= 8;
4556                     if (PL_reg_poscache[offset] & mask) {
4557                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4558                             "%*s  whilem: (cache) already tried at this position...\n",
4559                             REPORT_CODE_OFF+depth*2, "")
4560                         );
4561                         sayNO; /* cache records failure */
4562                     }
4563                     ST.cache_offset = offset;
4564                     ST.cache_mask   = mask;
4565                 }
4566             }
4567
4568             /* Prefer B over A for minimal matching. */
4569
4570             if (cur_curlyx->u.curlyx.minmod) {
4571                 ST.save_curlyx = cur_curlyx;
4572                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4573                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4574                 REGCP_SET(ST.lastcp);
4575                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4576                 /* NOTREACHED */
4577             }
4578
4579             /* Prefer A over B for maximal matching. */
4580
4581             if (n < max) { /* More greed allowed? */
4582                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4583                 cur_curlyx->u.curlyx.lastloc = locinput;
4584                 REGCP_SET(ST.lastcp);
4585                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4586                 /* NOTREACHED */
4587             }
4588             goto do_whilem_B_max;
4589         }
4590         /* NOTREACHED */
4591
4592         case WHILEM_B_min: /* just matched B in a minimal match */
4593         case WHILEM_B_max: /* just matched B in a maximal match */
4594             cur_curlyx = ST.save_curlyx;
4595             sayYES;
4596             /* NOTREACHED */
4597
4598         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4599             cur_curlyx = ST.save_curlyx;
4600             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4601             cur_curlyx->u.curlyx.count--;
4602             CACHEsayNO;
4603             /* NOTREACHED */
4604
4605         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4606             REGCP_UNWIND(ST.lastcp);
4607             regcppop(rex);
4608             /* FALL THROUGH */
4609         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4610             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4611             cur_curlyx->u.curlyx.count--;
4612             CACHEsayNO;
4613             /* NOTREACHED */
4614
4615         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4616             REGCP_UNWIND(ST.lastcp);
4617             regcppop(rex);      /* Restore some previous $<digit>s? */
4618             PL_reginput = locinput;
4619             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4620                 "%*s  whilem: failed, trying continuation...\n",
4621                 REPORT_CODE_OFF+depth*2, "")
4622             );
4623           do_whilem_B_max:
4624             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4625                 && ckWARN(WARN_REGEXP)
4626                 && !(PL_reg_flags & RF_warned))
4627             {
4628                 PL_reg_flags |= RF_warned;
4629                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4630                      "Complex regular subexpression recursion",
4631                      REG_INFTY - 1);
4632             }
4633
4634             /* now try B */
4635             ST.save_curlyx = cur_curlyx;
4636             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4637             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4638             /* NOTREACHED */
4639
4640         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4641             cur_curlyx = ST.save_curlyx;
4642             REGCP_UNWIND(ST.lastcp);
4643             regcppop(rex);
4644
4645             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4646                 /* Maximum greed exceeded */
4647                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4648                     && ckWARN(WARN_REGEXP)
4649                     && !(PL_reg_flags & RF_warned))
4650                 {
4651                     PL_reg_flags |= RF_warned;
4652                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4653                         "%s limit (%d) exceeded",
4654                         "Complex regular subexpression recursion",
4655                         REG_INFTY - 1);
4656                 }
4657                 cur_curlyx->u.curlyx.count--;
4658                 CACHEsayNO;
4659             }
4660
4661             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4662                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4663             );
4664             /* Try grabbing another A and see if it helps. */
4665             PL_reginput = locinput;
4666             cur_curlyx->u.curlyx.lastloc = locinput;
4667             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4668             REGCP_SET(ST.lastcp);
4669             PUSH_STATE_GOTO(WHILEM_A_min,
4670                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4671             /* NOTREACHED */
4672
4673 #undef  ST
4674 #define ST st->u.branch
4675
4676         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4677             next = scan + ARG(scan);
4678             if (next == scan)
4679                 next = NULL;
4680             scan = NEXTOPER(scan);
4681             /* FALL THROUGH */
4682
4683         case BRANCH:        /*  /(...|A|...)/ */
4684             scan = NEXTOPER(scan); /* scan now points to inner node */
4685             ST.lastparen = *PL_reglastparen;
4686             ST.next_branch = next;
4687             REGCP_SET(ST.cp);
4688             PL_reginput = locinput;
4689
4690             /* Now go into the branch */
4691             if (has_cutgroup) {
4692                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4693             } else {
4694                 PUSH_STATE_GOTO(BRANCH_next, scan);
4695             }
4696             /* NOTREACHED */
4697         case CUTGROUP:
4698             PL_reginput = locinput;
4699             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4700                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4701             PUSH_STATE_GOTO(CUTGROUP_next,next);
4702             /* NOTREACHED */
4703         case CUTGROUP_next_fail:
4704             do_cutgroup = 1;
4705             no_final = 1;
4706             if (st->u.mark.mark_name)
4707                 sv_commit = st->u.mark.mark_name;
4708             sayNO;
4709             /* NOTREACHED */
4710         case BRANCH_next:
4711             sayYES;
4712             /* NOTREACHED */
4713         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4714             if (do_cutgroup) {
4715                 do_cutgroup = 0;
4716                 no_final = 0;
4717             }
4718             REGCP_UNWIND(ST.cp);
4719             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4720                 PL_regoffs[n].end = -1;
4721             *PL_reglastparen = n;
4722             /*dmq: *PL_reglastcloseparen = n; */
4723             scan = ST.next_branch;
4724             /* no more branches? */
4725             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4726                 DEBUG_EXECUTE_r({
4727                     PerlIO_printf( Perl_debug_log,
4728                         "%*s  %sBRANCH failed...%s\n",
4729                         REPORT_CODE_OFF+depth*2, "",
4730                         PL_colors[4],
4731                         PL_colors[5] );
4732                 });
4733                 sayNO_SILENT;
4734             }
4735             continue; /* execute next BRANCH[J] op */
4736             /* NOTREACHED */
4737
4738         case MINMOD:
4739             minmod = 1;
4740             break;
4741
4742 #undef  ST
4743 #define ST st->u.curlym
4744
4745         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4746
4747             /* This is an optimisation of CURLYX that enables us to push
4748              * only a single backtracking state, no matter how many matches
4749              * there are in {m,n}. It relies on the pattern being constant
4750              * length, with no parens to influence future backrefs
4751              */
4752
4753             ST.me = scan;
4754             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4755
4756             /* if paren positive, emulate an OPEN/CLOSE around A */
4757             if (ST.me->flags) {
4758                 U32 paren = ST.me->flags;
4759                 if (paren > PL_regsize)
4760                     PL_regsize = paren;
4761                 if (paren > *PL_reglastparen)
4762                     *PL_reglastparen = paren;
4763                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4764             }
4765             ST.A = scan;
4766             ST.B = next;
4767             ST.alen = 0;
4768             ST.count = 0;
4769             ST.minmod = minmod;
4770             minmod = 0;
4771             ST.c1 = CHRTEST_UNINIT;
4772             REGCP_SET(ST.cp);
4773
4774             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4775                 goto curlym_do_B;
4776
4777           curlym_do_A: /* execute the A in /A{m,n}B/  */
4778             PL_reginput = locinput;
4779             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4780             /* NOTREACHED */
4781
4782         case CURLYM_A: /* we've just matched an A */
4783             locinput = st->locinput;
4784             nextchr = UCHARAT(locinput);
4785
4786             ST.count++;
4787             /* after first match, determine A's length: u.curlym.alen */
4788             if (ST.count == 1) {
4789                 if (PL_reg_match_utf8) {
4790                     char *s = locinput;
4791                     while (s < PL_reginput) {
4792                         ST.alen++;
4793                         s += UTF8SKIP(s);
4794                     }
4795                 }
4796                 else {
4797                     ST.alen = PL_reginput - locinput;
4798                 }
4799                 if (ST.alen == 0)
4800                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4801             }
4802             DEBUG_EXECUTE_r(
4803                 PerlIO_printf(Perl_debug_log,
4804                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4805                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4806                           (IV) ST.count, (IV)ST.alen)
4807             );
4808
4809             locinput = PL_reginput;
4810
4811             if (cur_eval && cur_eval->u.eval.close_paren &&
4812                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4813                 goto fake_end;
4814
4815             {
4816                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4817                 if ( max == REG_INFTY || ST.count < max )
4818                     goto curlym_do_A; /* try to match another A */
4819             }
4820             goto curlym_do_B; /* try to match B */
4821
4822         case CURLYM_A_fail: /* just failed to match an A */
4823             REGCP_UNWIND(ST.cp);
4824
4825             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4826                 || (cur_eval && cur_eval->u.eval.close_paren &&
4827                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4828                 sayNO;
4829
4830           curlym_do_B: /* execute the B in /A{m,n}B/  */
4831             PL_reginput = locinput;
4832             if (ST.c1 == CHRTEST_UNINIT) {
4833                 /* calculate c1 and c2 for possible match of 1st char
4834                  * following curly */
4835                 ST.c1 = ST.c2 = CHRTEST_VOID;
4836                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4837                     regnode *text_node = ST.B;
4838                     if (! HAS_TEXT(text_node))
4839                         FIND_NEXT_IMPT(text_node);
4840                     /* this used to be
4841
4842                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4843
4844                         But the former is redundant in light of the latter.
4845
4846                         if this changes back then the macro for
4847                         IS_TEXT and friends need to change.
4848                      */
4849                     if (PL_regkind[OP(text_node)] == EXACT)
4850                     {
4851
4852                         ST.c1 = (U8)*STRING(text_node);
4853                         ST.c2 =
4854                             (IS_TEXTF(text_node))
4855                             ? PL_fold[ST.c1]
4856                             : (IS_TEXTFL(text_node))
4857                                 ? PL_fold_locale[ST.c1]
4858                                 : ST.c1;
4859                     }
4860                 }
4861             }
4862
4863             DEBUG_EXECUTE_r(
4864                 PerlIO_printf(Perl_debug_log,
4865                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4866                     (int)(REPORT_CODE_OFF+(depth*2)),
4867                     "", (IV)ST.count)
4868                 );
4869             if (ST.c1 != CHRTEST_VOID
4870                     && UCHARAT(PL_reginput) != ST.c1
4871                     && UCHARAT(PL_reginput) != ST.c2)
4872             {
4873                 /* simulate B failing */
4874                 DEBUG_OPTIMISE_r(
4875                     PerlIO_printf(Perl_debug_log,
4876                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4877                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4878                         (IV)ST.c1,(IV)ST.c2
4879                 ));
4880                 state_num = CURLYM_B_fail;
4881                 goto reenter_switch;
4882             }
4883
4884             if (ST.me->flags) {
4885                 /* mark current A as captured */
4886                 I32 paren = ST.me->flags;
4887                 if (ST.count) {
4888                     PL_regoffs[paren].start
4889                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4890                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4891                     /*dmq: *PL_reglastcloseparen = paren; */
4892                 }
4893                 else
4894                     PL_regoffs[paren].end = -1;
4895                 if (cur_eval && cur_eval->u.eval.close_paren &&
4896                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4897                 {
4898                     if (ST.count)
4899                         goto fake_end;
4900                     else
4901                         sayNO;
4902                 }
4903             }
4904
4905             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4906             /* NOTREACHED */
4907
4908         case CURLYM_B_fail: /* just failed to match a B */
4909             REGCP_UNWIND(ST.cp);
4910             if (ST.minmod) {
4911                 I32 max = ARG2(ST.me);
4912                 if (max != REG_INFTY && ST.count == max)
4913                     sayNO;
4914                 goto curlym_do_A; /* try to match a further A */
4915             }
4916             /* backtrack one A */
4917             if (ST.count == ARG1(ST.me) /* min */)
4918                 sayNO;
4919             ST.count--;
4920             locinput = HOPc(locinput, -ST.alen);
4921             goto curlym_do_B; /* try to match B */
4922
4923 #undef ST
4924 #define ST st->u.curly
4925
4926 #define CURLY_SETPAREN(paren, success) \
4927     if (paren) { \
4928         if (success) { \
4929             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4930             PL_regoffs[paren].end = locinput - PL_bostr; \
4931             *PL_reglastcloseparen = paren; \
4932         } \
4933         else \
4934             PL_regoffs[paren].end = -1; \
4935     }
4936
4937         case STAR:              /*  /A*B/ where A is width 1 */
4938             ST.paren = 0;
4939             ST.min = 0;
4940             ST.max = REG_INFTY;
4941             scan = NEXTOPER(scan);
4942             goto repeat;
4943         case PLUS:              /*  /A+B/ where A is width 1 */
4944             ST.paren = 0;
4945             ST.min = 1;
4946             ST.max = REG_INFTY;
4947             scan = NEXTOPER(scan);
4948             goto repeat;
4949         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
4950             ST.paren = scan->flags;     /* Which paren to set */
4951             if (ST.paren > PL_regsize)
4952                 PL_regsize = ST.paren;
4953             if (ST.paren > *PL_reglastparen)
4954                 *PL_reglastparen = ST.paren;
4955             ST.min = ARG1(scan);  /* min to match */
4956             ST.max = ARG2(scan);  /* max to match */
4957             if (cur_eval && cur_eval->u.eval.close_paren &&
4958                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4959                 ST.min=1;
4960                 ST.max=1;
4961             }
4962             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4963             goto repeat;
4964         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
4965             ST.paren = 0;
4966             ST.min = ARG1(scan);  /* min to match */
4967             ST.max = ARG2(scan);  /* max to match */
4968             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4969           repeat:
4970             /*
4971             * Lookahead to avoid useless match attempts
4972             * when we know what character comes next.
4973             *
4974             * Used to only do .*x and .*?x, but now it allows
4975             * for )'s, ('s and (?{ ... })'s to be in the way
4976             * of the quantifier and the EXACT-like node.  -- japhy
4977             */
4978
4979             if (ST.min > ST.max) /* XXX make this a compile-time check? */
4980                 sayNO;
4981             if (HAS_TEXT(next) || JUMPABLE(next)) {
4982                 U8 *s;
4983                 regnode *text_node = next;
4984
4985                 if (! HAS_TEXT(text_node))
4986                     FIND_NEXT_IMPT(text_node);
4987
4988                 if (! HAS_TEXT(text_node))
4989                     ST.c1 = ST.c2 = CHRTEST_VOID;
4990                 else {
4991                     if ( PL_regkind[OP(text_node)] != EXACT ) {
4992                         ST.c1 = ST.c2 = CHRTEST_VOID;
4993                         goto assume_ok_easy;
4994                     }
4995                     else
4996                         s = (U8*)STRING(text_node);
4997
4998                     /*  Currently we only get here when
4999
5000                         PL_rekind[OP(text_node)] == EXACT
5001
5002                         if this changes back then the macro for IS_TEXT and
5003                         friends need to change. */
5004                     if (!UTF_PATTERN) {
5005                         ST.c2 = ST.c1 = *s;
5006                         if (IS_TEXTF(text_node))
5007                             ST.c2 = PL_fold[ST.c1];
5008                         else if (IS_TEXTFL(text_node))
5009                             ST.c2 = PL_fold_locale[ST.c1];
5010                     }
5011                     else { /* UTF_PATTERN */
5012                         if (IS_TEXTF(text_node)) {
5013                              STRLEN ulen1, ulen2;
5014                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5015                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5016
5017                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5018                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5019 #ifdef EBCDIC
5020                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5021                                                     ckWARN(WARN_UTF8) ?
5022                                                     0 : UTF8_ALLOW_ANY);
5023                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5024                                                     ckWARN(WARN_UTF8) ?
5025                                                     0 : UTF8_ALLOW_ANY);
5026 #else
5027                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5028                                                     uniflags);
5029                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5030                                                     uniflags);
5031 #endif
5032                         }
5033                         else {
5034                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5035                                                      uniflags);
5036                         }
5037                     }
5038                 }
5039             }
5040             else
5041                 ST.c1 = ST.c2 = CHRTEST_VOID;
5042         assume_ok_easy:
5043
5044             ST.A = scan;
5045             ST.B = next;
5046             PL_reginput = locinput;
5047             if (minmod) {
5048                 minmod = 0;
5049                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5050                     sayNO;
5051                 ST.count = ST.min;
5052                 locinput = PL_reginput;
5053                 REGCP_SET(ST.cp);
5054                 if (ST.c1 == CHRTEST_VOID)
5055                     goto curly_try_B_min;
5056
5057                 ST.oldloc = locinput;
5058
5059                 /* set ST.maxpos to the furthest point along the
5060                  * string that could possibly match */
5061                 if  (ST.max == REG_INFTY) {
5062                     ST.maxpos = PL_regeol - 1;
5063                     if (utf8_target)
5064                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5065                             ST.maxpos--;
5066                 }
5067                 else if (utf8_target) {
5068                     int m = ST.max - ST.min;
5069                     for (ST.maxpos = locinput;
5070                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5071                         ST.maxpos += UTF8SKIP(ST.maxpos);
5072                 }
5073                 else {
5074                     ST.maxpos = locinput + ST.max - ST.min;
5075                     if (ST.maxpos >= PL_regeol)
5076                         ST.maxpos = PL_regeol - 1;
5077                 }
5078                 goto curly_try_B_min_known;
5079
5080             }
5081             else {
5082                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5083                 locinput = PL_reginput;
5084                 if (ST.count < ST.min)
5085                     sayNO;
5086                 if ((ST.count > ST.min)
5087                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5088                 {
5089                     /* A{m,n} must come at the end of the string, there's
5090                      * no point in backing off ... */
5091                     ST.min = ST.count;
5092                     /* ...except that $ and \Z can match before *and* after
5093                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5094                        We may back off by one in this case. */
5095                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5096                         ST.min--;
5097                 }
5098                 REGCP_SET(ST.cp);
5099                 goto curly_try_B_max;
5100             }
5101             /* NOTREACHED */
5102
5103
5104         case CURLY_B_min_known_fail:
5105             /* failed to find B in a non-greedy match where c1,c2 valid */
5106             if (ST.paren && ST.count)
5107                 PL_regoffs[ST.paren].end = -1;
5108
5109             PL_reginput = locinput;     /* Could be reset... */
5110             REGCP_UNWIND(ST.cp);
5111             /* Couldn't or didn't -- move forward. */
5112             ST.oldloc = locinput;
5113             if (utf8_target)
5114                 locinput += UTF8SKIP(locinput);
5115             else
5116                 locinput++;
5117             ST.count++;
5118           curly_try_B_min_known:
5119              /* find the next place where 'B' could work, then call B */
5120             {
5121                 int n;
5122                 if (utf8_target) {
5123                     n = (ST.oldloc == locinput) ? 0 : 1;
5124                     if (ST.c1 == ST.c2) {
5125                         STRLEN len;
5126                         /* set n to utf8_distance(oldloc, locinput) */
5127                         while (locinput <= ST.maxpos &&
5128                                utf8n_to_uvchr((U8*)locinput,
5129                                               UTF8_MAXBYTES, &len,
5130                                               uniflags) != (UV)ST.c1) {
5131                             locinput += len;
5132                             n++;
5133                         }
5134                     }
5135                     else {
5136                         /* set n to utf8_distance(oldloc, locinput) */
5137                         while (locinput <= ST.maxpos) {
5138                             STRLEN len;
5139                             const UV c = utf8n_to_uvchr((U8*)locinput,
5140                                                   UTF8_MAXBYTES, &len,
5141                                                   uniflags);
5142                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5143                                 break;
5144                             locinput += len;
5145                             n++;
5146                         }
5147                     }
5148                 }
5149                 else {
5150                     if (ST.c1 == ST.c2) {
5151                         while (locinput <= ST.maxpos &&
5152                                UCHARAT(locinput) != ST.c1)
5153                             locinput++;
5154                     }
5155                     else {
5156                         while (locinput <= ST.maxpos
5157                                && UCHARAT(locinput) != ST.c1
5158                                && UCHARAT(locinput) != ST.c2)
5159                             locinput++;
5160                     }
5161                     n = locinput - ST.oldloc;
5162                 }
5163                 if (locinput > ST.maxpos)
5164                     sayNO;
5165                 /* PL_reginput == oldloc now */
5166                 if (n) {
5167                     ST.count += n;
5168                     if (regrepeat(rex, ST.A, n, depth) < n)
5169                         sayNO;
5170                 }
5171                 PL_reginput = locinput;
5172                 CURLY_SETPAREN(ST.paren, ST.count);
5173                 if (cur_eval && cur_eval->u.eval.close_paren &&
5174                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5175                     goto fake_end;
5176                 }
5177                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5178             }
5179             /* NOTREACHED */
5180
5181
5182         case CURLY_B_min_fail:
5183             /* failed to find B in a non-greedy match where c1,c2 invalid */
5184             if (ST.paren && ST.count)
5185                 PL_regoffs[ST.paren].end = -1;
5186
5187             REGCP_UNWIND(ST.cp);
5188             /* failed -- move forward one */
5189             PL_reginput = locinput;
5190             if (regrepeat(rex, ST.A, 1, depth)) {
5191                 ST.count++;
5192                 locinput = PL_reginput;
5193                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5194                         ST.count > 0)) /* count overflow ? */
5195                 {
5196                   curly_try_B_min:
5197                     CURLY_SETPAREN(ST.paren, ST.count);
5198                     if (cur_eval && cur_eval->u.eval.close_paren &&
5199                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5200                         goto fake_end;
5201                     }
5202                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5203                 }
5204             }
5205             sayNO;
5206             /* NOTREACHED */
5207
5208
5209         curly_try_B_max:
5210             /* a successful greedy match: now try to match B */
5211             if (cur_eval && cur_eval->u.eval.close_paren &&
5212                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5213                 goto fake_end;
5214             }
5215             {
5216                 UV c = 0;
5217                 if (ST.c1 != CHRTEST_VOID)
5218                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5219                                            UTF8_MAXBYTES, 0, uniflags)
5220                                 : (UV) UCHARAT(PL_reginput);
5221                 /* If it could work, try it. */
5222                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5223                     CURLY_SETPAREN(ST.paren, ST.count);
5224                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5225                     /* NOTREACHED */
5226                 }
5227             }
5228             /* FALL THROUGH */
5229         case CURLY_B_max_fail:
5230             /* failed to find B in a greedy match */
5231             if (ST.paren && ST.count)
5232                 PL_regoffs[ST.paren].end = -1;
5233
5234             REGCP_UNWIND(ST.cp);
5235             /*  back up. */
5236             if (--ST.count < ST.min)
5237                 sayNO;
5238             PL_reginput = locinput = HOPc(locinput, -1);
5239             goto curly_try_B_max;
5240
5241 #undef ST
5242
5243         case END:
5244             fake_end:
5245             if (cur_eval) {
5246                 /* we've just finished A in /(??{A})B/; now continue with B */
5247                 I32 tmpix;
5248                 st->u.eval.toggle_reg_flags
5249                             = cur_eval->u.eval.toggle_reg_flags;
5250                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5251
5252                 st->u.eval.prev_rex = rex_sv;           /* inner */
5253                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5254                 rex = (struct regexp *)SvANY(rex_sv);
5255                 rexi = RXi_GET(rex);
5256                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5257                 ReREFCNT_inc(rex_sv);
5258                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5259
5260                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5261                 PL_reglastparen = &rex->lastparen;
5262                 PL_reglastcloseparen = &rex->lastcloseparen;
5263
5264                 REGCP_SET(st->u.eval.lastcp);
5265                 PL_reginput = locinput;
5266
5267                 /* Restore parens of the outer rex without popping the
5268                  * savestack */
5269                 tmpix = PL_savestack_ix;
5270                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5271                 regcppop(rex);
5272                 PL_savestack_ix = tmpix;
5273
5274                 st->u.eval.prev_eval = cur_eval;
5275                 cur_eval = cur_eval->u.eval.prev_eval;
5276                 DEBUG_EXECUTE_r(
5277                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5278                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5279                 if ( nochange_depth )
5280                     nochange_depth--;
5281
5282                 PUSH_YES_STATE_GOTO(EVAL_AB,
5283                         st->u.eval.prev_eval->u.eval.B); /* match B */
5284             }
5285
5286             if (locinput < reginfo->till) {
5287                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5288                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5289                                       PL_colors[4],
5290                                       (long)(locinput - PL_reg_starttry),
5291                                       (long)(reginfo->till - PL_reg_starttry),
5292                                       PL_colors[5]));
5293
5294                 sayNO_SILENT;           /* Cannot match: too short. */
5295             }
5296             PL_reginput = locinput;     /* put where regtry can find it */
5297             sayYES;                     /* Success! */
5298
5299         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5300             DEBUG_EXECUTE_r(
5301             PerlIO_printf(Perl_debug_log,
5302                 "%*s  %ssubpattern success...%s\n",
5303                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5304             PL_reginput = locinput;     /* put where regtry can find it */
5305             sayYES;                     /* Success! */
5306
5307 #undef  ST
5308 #define ST st->u.ifmatch
5309
5310         case SUSPEND:   /* (?>A) */
5311             ST.wanted = 1;
5312             PL_reginput = locinput;
5313             goto do_ifmatch;
5314
5315         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5316             ST.wanted = 0;
5317             goto ifmatch_trivial_fail_test;
5318
5319         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5320             ST.wanted = 1;
5321           ifmatch_trivial_fail_test:
5322             if (scan->flags) {
5323                 char * const s = HOPBACKc(locinput, scan->flags);
5324                 if (!s) {
5325                     /* trivial fail */
5326                     if (logical) {
5327                         logical = 0;
5328                         sw = 1 - cBOOL(ST.wanted);
5329                     }
5330                     else if (ST.wanted)
5331                         sayNO;
5332                     next = scan + ARG(scan);
5333                     if (next == scan)
5334                         next = NULL;
5335                     break;
5336                 }
5337                 PL_reginput = s;
5338             }
5339             else
5340                 PL_reginput = locinput;
5341
5342           do_ifmatch:
5343             ST.me = scan;
5344             ST.logical = logical;
5345             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5346
5347             /* execute body of (?...A) */
5348             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5349             /* NOTREACHED */
5350
5351         case IFMATCH_A_fail: /* body of (?...A) failed */
5352             ST.wanted = !ST.wanted;
5353             /* FALL THROUGH */
5354
5355         case IFMATCH_A: /* body of (?...A) succeeded */
5356             if (ST.logical) {
5357                 sw = cBOOL(ST.wanted);
5358             }
5359             else if (!ST.wanted)
5360                 sayNO;
5361
5362             if (OP(ST.me) == SUSPEND)
5363                 locinput = PL_reginput;
5364             else {
5365                 locinput = PL_reginput = st->locinput;
5366                 nextchr = UCHARAT(locinput);
5367             }
5368             scan = ST.me + ARG(ST.me);
5369             if (scan == ST.me)
5370                 scan = NULL;
5371             continue; /* execute B */
5372
5373 #undef ST
5374
5375         case LONGJMP:
5376             next = scan + ARG(scan);
5377             if (next == scan)
5378                 next = NULL;
5379             break;
5380         case COMMIT:
5381             reginfo->cutpoint = PL_regeol;
5382             /* FALLTHROUGH */
5383         case PRUNE:
5384             PL_reginput = locinput;
5385             if (!scan->flags)
5386                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5387             PUSH_STATE_GOTO(COMMIT_next,next);
5388             /* NOTREACHED */
5389         case COMMIT_next_fail:
5390             no_final = 1;
5391             /* FALLTHROUGH */
5392         case OPFAIL:
5393             sayNO;
5394             /* NOTREACHED */
5395
5396 #define ST st->u.mark
5397         case MARKPOINT:
5398             ST.prev_mark = mark_state;
5399             ST.mark_name = sv_commit = sv_yes_mark
5400                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5401             mark_state = st;
5402             ST.mark_loc = PL_reginput = locinput;
5403             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5404             /* NOTREACHED */
5405         case MARKPOINT_next:
5406             mark_state = ST.prev_mark;
5407             sayYES;
5408             /* NOTREACHED */
5409         case MARKPOINT_next_fail:
5410             if (popmark && sv_eq(ST.mark_name,popmark))
5411             {
5412                 if (ST.mark_loc > startpoint)
5413                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5414                 popmark = NULL; /* we found our mark */
5415                 sv_commit = ST.mark_name;
5416
5417                 DEBUG_EXECUTE_r({
5418                         PerlIO_printf(Perl_debug_log,
5419                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5420                             REPORT_CODE_OFF+depth*2, "",
5421                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5422                 });
5423             }
5424             mark_state = ST.prev_mark;
5425             sv_yes_mark = mark_state ?
5426                 mark_state->u.mark.mark_name : NULL;
5427             sayNO;
5428             /* NOTREACHED */
5429         case SKIP:
5430             PL_reginput = locinput;
5431             if (scan->flags) {
5432                 /* (*SKIP) : if we fail we cut here*/
5433                 ST.mark_name = NULL;
5434                 ST.mark_loc = locinput;
5435                 PUSH_STATE_GOTO(SKIP_next,next);
5436             } else {
5437                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5438                    otherwise do nothing.  Meaning we need to scan
5439                  */
5440                 regmatch_state *cur = mark_state;
5441                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5442
5443                 while (cur) {
5444                     if ( sv_eq( cur->u.mark.mark_name,
5445                                 find ) )
5446                     {
5447                         ST.mark_name = find;
5448                         PUSH_STATE_GOTO( SKIP_next, next );
5449                     }
5450                     cur = cur->u.mark.prev_mark;
5451                 }
5452             }
5453             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5454             break;
5455         case SKIP_next_fail:
5456             if (ST.mark_name) {
5457                 /* (*CUT:NAME) - Set up to search for the name as we
5458                    collapse the stack*/
5459                 popmark = ST.mark_name;
5460             } else {
5461                 /* (*CUT) - No name, we cut here.*/
5462                 if (ST.mark_loc > startpoint)
5463                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5464                 /* but we set sv_commit to latest mark_name if there
5465                    is one so they can test to see how things lead to this
5466                    cut */
5467                 if (mark_state)
5468                     sv_commit=mark_state->u.mark.mark_name;
5469             }
5470             no_final = 1;
5471             sayNO;
5472             /* NOTREACHED */
5473 #undef ST
5474         case FOLDCHAR:
5475             n = ARG(scan);
5476             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5477                 locinput += ln;
5478             } else if ( 0xDF == n && !utf8_target && !UTF_PATTERN ) {
5479                 sayNO;
5480             } else  {
5481                 U8 folded[UTF8_MAXBYTES_CASE+1];
5482                 STRLEN foldlen;
5483                 const char * const l = locinput;
5484                 char *e = PL_regeol;
5485                 to_uni_fold(n, folded, &foldlen);
5486
5487                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5488                                l, &e, 0,  utf8_target)) {
5489                         sayNO;
5490                 }
5491                 locinput = e;
5492             }
5493             nextchr = UCHARAT(locinput);
5494             break;
5495         case LNBREAK:
5496             if ((n=is_LNBREAK(locinput,utf8_target))) {
5497                 locinput += n;
5498                 nextchr = UCHARAT(locinput);
5499             } else
5500                 sayNO;
5501             break;
5502
5503 #define CASE_CLASS(nAmE)                              \
5504         case nAmE:                                    \
5505             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5506                 locinput += n;                        \
5507                 nextchr = UCHARAT(locinput);          \
5508             } else                                    \
5509                 sayNO;                                \
5510             break;                                    \
5511         case N##nAmE:                                 \
5512             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5513                 sayNO;                                \
5514             } else {                                  \
5515                 locinput += UTF8SKIP(locinput);       \
5516                 nextchr = UCHARAT(locinput);          \
5517             }                                         \
5518             break
5519
5520         CASE_CLASS(VERTWS);
5521         CASE_CLASS(HORIZWS);
5522 #undef CASE_CLASS
5523
5524         default:
5525             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5526                           PTR2UV(scan), OP(scan));
5527             Perl_croak(aTHX_ "regexp memory corruption");
5528
5529         } /* end switch */
5530
5531         /* switch break jumps here */
5532         scan = next; /* prepare to execute the next op and ... */
5533         continue;    /* ... jump back to the top, reusing st */
5534         /* NOTREACHED */
5535
5536       push_yes_state:
5537         /* push a state that backtracks on success */
5538         st->u.yes.prev_yes_state = yes_state;
5539         yes_state = st;
5540         /* FALL THROUGH */
5541       push_state:
5542         /* push a new regex state, then continue at scan  */
5543         {
5544             regmatch_state *newst;
5545
5546             DEBUG_STACK_r({
5547                 regmatch_state *cur = st;
5548                 regmatch_state *curyes = yes_state;
5549                 int curd = depth;
5550                 regmatch_slab *slab = PL_regmatch_slab;
5551                 for (;curd > -1;cur--,curd--) {
5552                     if (cur < SLAB_FIRST(slab)) {
5553                         slab = slab->prev;
5554                         cur = SLAB_LAST(slab);
5555                     }
5556                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5557                         REPORT_CODE_OFF + 2 + depth * 2,"",
5558                         curd, PL_reg_name[cur->resume_state],
5559                         (curyes == cur) ? "yes" : ""
5560                     );
5561                     if (curyes == cur)
5562                         curyes = cur->u.yes.prev_yes_state;
5563                 }
5564             } else
5565                 DEBUG_STATE_pp("push")
5566             );
5567             depth++;
5568             st->locinput = locinput;
5569             newst = st+1;
5570             if (newst >  SLAB_LAST(PL_regmatch_slab))
5571                 newst = S_push_slab(aTHX);
5572             PL_regmatch_state = newst;
5573
5574             locinput = PL_reginput;
5575             nextchr = UCHARAT(locinput);
5576             st = newst;
5577             continue;
5578             /* NOTREACHED */
5579         }
5580     }
5581
5582     /*
5583     * We get here only if there's trouble -- normally "case END" is
5584     * the terminating point.
5585     */
5586     Perl_croak(aTHX_ "corrupted regexp pointers");
5587     /*NOTREACHED*/
5588     sayNO;
5589
5590 yes:
5591     if (yes_state) {
5592         /* we have successfully completed a subexpression, but we must now
5593          * pop to the state marked by yes_state and continue from there */
5594         assert(st != yes_state);
5595 #ifdef DEBUGGING
5596         while (st != yes_state) {
5597             st--;
5598             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5599                 PL_regmatch_slab = PL_regmatch_slab->prev;
5600                 st = SLAB_LAST(PL_regmatch_slab);
5601             }
5602             DEBUG_STATE_r({
5603                 if (no_final) {
5604                     DEBUG_STATE_pp("pop (no final)");
5605                 } else {
5606                     DEBUG_STATE_pp("pop (yes)");
5607                 }
5608             });
5609             depth--;
5610         }
5611 #else
5612         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5613             || yes_state > SLAB_LAST(PL_regmatch_slab))
5614         {
5615             /* not in this slab, pop slab */
5616             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5617             PL_regmatch_slab = PL_regmatch_slab->prev;
5618             st = SLAB_LAST(PL_regmatch_slab);
5619         }
5620         depth -= (st - yes_state);
5621 #endif
5622         st = yes_state;
5623         yes_state = st->u.yes.prev_yes_state;
5624         PL_regmatch_state = st;
5625
5626         if (no_final) {
5627             locinput= st->locinput;
5628             nextchr = UCHARAT(locinput);
5629         }
5630         state_num = st->resume_state + no_final;
5631         goto reenter_switch;
5632     }
5633
5634     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5635                           PL_colors[4], PL_colors[5]));
5636
5637     if (PL_reg_eval_set) {
5638         /* each successfully executed (?{...}) block does the equivalent of
5639          *   local $^R = do {...}
5640          * When popping the save stack, all these locals would be undone;
5641          * bypass this by setting the outermost saved $^R to the latest
5642          * value */
5643         if (oreplsv != GvSV(PL_replgv))
5644             sv_setsv(oreplsv, GvSV(PL_replgv));
5645     }
5646     result = 1;
5647     goto final_exit;
5648
5649 no:
5650     DEBUG_EXECUTE_r(
5651         PerlIO_printf(Perl_debug_log,
5652             "%*s  %sfailed...%s\n",
5653             REPORT_CODE_OFF+depth*2, "",
5654             PL_colors[4], PL_colors[5])
5655         );
5656
5657 no_silent:
5658     if (no_final) {
5659         if (yes_state) {
5660             goto yes;
5661         } else {
5662             goto final_exit;
5663         }
5664     }
5665     if (depth) {
5666         /* there's a previous state to backtrack to */
5667         st--;
5668         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5669             PL_regmatch_slab = PL_regmatch_slab->prev;
5670             st = SLAB_LAST(PL_regmatch_slab);
5671         }
5672         PL_regmatch_state = st;
5673         locinput= st->locinput;
5674         nextchr = UCHARAT(locinput);
5675
5676         DEBUG_STATE_pp("pop");
5677         depth--;
5678         if (yes_state == st)
5679             yes_state = st->u.yes.prev_yes_state;
5680
5681         state_num = st->resume_state + 1; /* failure = success + 1 */
5682         goto reenter_switch;
5683     }
5684     result = 0;
5685
5686   final_exit:
5687     if (rex->intflags & PREGf_VERBARG_SEEN) {
5688         SV *sv_err = get_sv("REGERROR", 1);
5689         SV *sv_mrk = get_sv("REGMARK", 1);
5690         if (result) {
5691             sv_commit = &PL_sv_no;
5692             if (!sv_yes_mark)
5693                 sv_yes_mark = &PL_sv_yes;
5694         } else {
5695             if (!sv_commit)
5696                 sv_commit = &PL_sv_yes;
5697             sv_yes_mark = &PL_sv_no;
5698         }
5699         sv_setsv(sv_err, sv_commit);
5700         sv_setsv(sv_mrk, sv_yes_mark);
5701     }
5702
5703     /* clean up; in particular, free all slabs above current one */
5704     LEAVE_SCOPE(oldsave);
5705
5706     return result;
5707 }
5708
5709 /*
5710  - regrepeat - repeatedly match something simple, report how many
5711  */
5712 /*
5713  * [This routine now assumes that it will only match on things of length 1.
5714  * That was true before, but now we assume scan - reginput is the count,
5715  * rather than incrementing count on every character.  [Er, except utf8.]]
5716  */
5717 STATIC I32
5718 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5719 {
5720     dVAR;
5721     register char *scan;
5722     register I32 c;
5723     register char *loceol = PL_regeol;
5724     register I32 hardcount = 0;
5725     register bool utf8_target = PL_reg_match_utf8;
5726 #ifndef DEBUGGING
5727     PERL_UNUSED_ARG(depth);
5728 #endif
5729
5730     PERL_ARGS_ASSERT_REGREPEAT;
5731
5732     scan = PL_reginput;
5733     if (max == REG_INFTY)
5734         max = I32_MAX;
5735     else if (max < loceol - scan)
5736         loceol = scan + max;
5737     switch (OP(p)) {
5738     case REG_ANY:
5739         if (utf8_target) {
5740             loceol = PL_regeol;
5741             while (scan < loceol && hardcount < max && *scan != '\n') {
5742                 scan += UTF8SKIP(scan);
5743                 hardcount++;
5744             }
5745         } else {
5746             while (scan < loceol && *scan != '\n')
5747                 scan++;
5748         }
5749         break;
5750     case SANY:
5751         if (utf8_target) {
5752             loceol = PL_regeol;
5753             while (scan < loceol && hardcount < max) {
5754                 scan += UTF8SKIP(scan);
5755                 hardcount++;
5756             }
5757         }
5758         else
5759             scan = loceol;
5760         break;
5761     case CANY:
5762         scan = loceol;
5763         break;
5764     case EXACTFL:
5765         PL_reg_flags |= RF_tainted;
5766         /* FALL THROUGH */
5767     case EXACT:
5768     case EXACTF:
5769         /* To get here, EXACTish nodes must have *byte* length == 1.  That means
5770          * they match only characters in the string that can be expressed as a
5771          * single byte.  For non-utf8 strings, that means a simple match.  For
5772          * utf8 strings, the character matched must be an invariant, or
5773          * downgradable to a single byte.  The pattern's utf8ness is
5774          * irrelevant, as it must be a single byte, so either it isn't utf8, or
5775          * if it is it's an invariant */
5776
5777         c = (U8)*STRING(p);
5778         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5779
5780         if ((! utf8_target) || UNI_IS_INVARIANT(c)) {
5781
5782             /* Here, the string isn't utf8, or the character in the EXACT
5783              * node is the same in utf8 as not, so can just do equality.
5784              * Each matching char must be 1 byte long */
5785             switch (OP(p)) {
5786             case EXACT:
5787                 while (scan < loceol && UCHARAT(scan) == c) {
5788                     scan++;
5789                 }
5790                 break;
5791             case EXACTF:
5792                 while (scan < loceol &&
5793                     (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5794                 {
5795                     scan++;
5796                 }
5797                 break;
5798             case EXACTFL:
5799                 while (scan < loceol &&
5800                     (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5801                 {
5802                     scan++;
5803                 }
5804                 break;
5805             default:
5806                 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
5807             }
5808         }
5809         else {
5810
5811             /* Here, the string is utf8, and the pattern char is different
5812              * in utf8 than not.  */
5813
5814             switch (OP(p)) {
5815             case EXACT:
5816                 {
5817                     /* Fastest to find the two utf8 bytes that represent c, and
5818                      * then look for those in sequence in the utf8 string */
5819                     U8 high = UTF8_TWO_BYTE_HI(c);
5820                     U8 low = UTF8_TWO_BYTE_LO(c);
5821                     loceol = PL_regeol;
5822
5823                     while (hardcount < max
5824                            && scan + 1 < loceol
5825                            && UCHARAT(scan) == high
5826                            && UCHARAT(scan + 1) == low)
5827                     {
5828                         scan += 2;
5829                         hardcount++;
5830                     }
5831                 }
5832                 break;
5833             case EXACTFL:   /* Doesn't really make sense, but is best we can
5834                                do.  The documents warn against mixing locale
5835                                and utf8 */
5836             case EXACTF:
5837                 {   /* utf8 string, so use utf8 foldEQ */
5838                     char *tmpeol = loceol;
5839                     while (hardcount < max
5840                            && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
5841                                           STRING(p), NULL, 1, UTF_PATTERN))
5842                     {
5843                         scan = tmpeol;
5844                         tmpeol = loceol;
5845                         hardcount++;
5846                     }
5847
5848                     /* XXX Note that the above handles properly the German
5849                      * sharp ss in the pattern matching ss in the string.  But
5850                      * it doesn't handle properly cases where the string
5851                      * contains say 'LIGATURE ff' and the pattern is 'f+'.
5852                      * This would require, say, a new function or revised
5853                      * interface to foldEQ_utf8(), in which the maximum number
5854                      * of characters to match could be passed and it would
5855                      * return how many actually did.  This is just one of many
5856                      * cases where multi-char folds don't work properly, and so
5857                      * the fix is being deferred */
5858                 }
5859                 break;
5860             default:
5861                 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
5862             }
5863         }
5864         break;
5865     case ANYOF:
5866         if (utf8_target) {
5867             loceol = PL_regeol;
5868             while (hardcount < max && scan < loceol &&
5869                    reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
5870                 scan += UTF8SKIP(scan);
5871                 hardcount++;
5872             }
5873         } else {
5874             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5875                 scan++;
5876         }
5877         break;
5878     case ALNUM:
5879         if (utf8_target) {
5880             loceol = PL_regeol;
5881             LOAD_UTF8_CHARCLASS_ALNUM();
5882             while (hardcount < max && scan < loceol &&
5883                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5884             {
5885                 scan += UTF8SKIP(scan);
5886                 hardcount++;
5887             }
5888         } else if (FLAGS(p) & USE_UNI) {
5889             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
5890                 scan++;
5891             }
5892         } else {
5893             while (scan < loceol && isALNUM((U8) *scan)) {
5894                 scan++;
5895             }
5896         }
5897         break;
5898     case ALNUML:
5899         PL_reg_flags |= RF_tainted;
5900         if (utf8_target) {
5901             loceol = PL_regeol;
5902             while (hardcount < max && scan < loceol &&
5903                    isALNUM_LC_utf8((U8*)scan)) {
5904                 scan += UTF8SKIP(scan);
5905                 hardcount++;
5906             }
5907         } else {
5908             while (scan < loceol && isALNUM_LC(*scan))
5909                 scan++;
5910         }
5911         break;
5912     case NALNUM:
5913         if (utf8_target) {
5914             loceol = PL_regeol;
5915             LOAD_UTF8_CHARCLASS_ALNUM();
5916             while (hardcount < max && scan < loceol &&
5917                    !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5918             {
5919                 scan += UTF8SKIP(scan);
5920                 hardcount++;
5921             }
5922         } else if (FLAGS(p) & USE_UNI) {
5923             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
5924                 scan++;
5925             }
5926         } else {
5927             while (scan < loceol && ! isALNUM((U8) *scan)) {
5928                 scan++;
5929             }
5930         }
5931         break;
5932     case NALNUML:
5933         PL_reg_flags |= RF_tainted;
5934         if (utf8_target) {
5935             loceol = PL_regeol;
5936             while (hardcount < max && scan < loceol &&
5937                    !isALNUM_LC_utf8((U8*)scan)) {
5938                 scan += UTF8SKIP(scan);
5939                 hardcount++;
5940             }
5941         } else {
5942             while (scan < loceol && !isALNUM_LC(*scan))
5943                 scan++;
5944         }
5945         break;
5946     case SPACE:
5947         if (utf8_target) {
5948             loceol = PL_regeol;
5949             LOAD_UTF8_CHARCLASS_SPACE();
5950             while (hardcount < max && scan < loceol &&
5951                    (*scan == ' ' ||
5952                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5953             {
5954                 scan += UTF8SKIP(scan);
5955                 hardcount++;
5956             }
5957         } else if (FLAGS(p) & USE_UNI) {
5958             while (scan < loceol && isSPACE_L1((U8) *scan)) {
5959                 scan++;
5960             }
5961         } else {
5962             while (scan < loceol && isSPACE((U8) *scan))
5963                 scan++;
5964         }
5965         break;
5966     case SPACEL:
5967         PL_reg_flags |= RF_tainted;
5968         if (utf8_target) {
5969             loceol = PL_regeol;
5970             while (hardcount < max && scan < loceol &&
5971                    (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5972                 scan += UTF8SKIP(scan);
5973                 hardcount++;
5974             }
5975         } else {
5976             while (scan < loceol && isSPACE_LC(*scan))
5977                 scan++;
5978         }
5979         break;
5980     case NSPACE:
5981         if (utf8_target) {
5982             loceol = PL_regeol;
5983             LOAD_UTF8_CHARCLASS_SPACE();
5984             while (hardcount < max && scan < loceol &&
5985                    !(*scan == ' ' ||
5986                      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5987             {
5988                 scan += UTF8SKIP(scan);
5989                 hardcount++;
5990             }
5991         } else if (FLAGS(p) & USE_UNI) {
5992             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
5993                 scan++;
5994             }
5995         } else {
5996             while (scan < loceol && ! isSPACE((U8) *scan)) {
5997                 scan++;
5998             }
5999         }
6000         break;
6001     case NSPACEL:
6002         PL_reg_flags |= RF_tainted;
6003         if (utf8_target) {
6004             loceol = PL_regeol;
6005             while (hardcount < max && scan < loceol &&
6006                    !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
6007                 scan += UTF8SKIP(scan);
6008                 hardcount++;
6009             }
6010         } else {
6011             while (scan < loceol && !isSPACE_LC(*scan))
6012                 scan++;
6013         }
6014         break;
6015     case DIGIT:
6016         if (utf8_target) {
6017             loceol = PL_regeol;
6018             LOAD_UTF8_CHARCLASS_DIGIT();
6019             while (hardcount < max && scan < loceol &&
6020                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6021                 scan += UTF8SKIP(scan);
6022                 hardcount++;
6023             }
6024         } else {
6025             while (scan < loceol && isDIGIT(*scan))
6026                 scan++;
6027         }
6028         break;
6029     case NDIGIT:
6030         if (utf8_target) {
6031             loceol = PL_regeol;
6032             LOAD_UTF8_CHARCLASS_DIGIT();
6033             while (hardcount < max && scan < loceol &&
6034                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6035                 scan += UTF8SKIP(scan);
6036                 hardcount++;
6037             }
6038         } else {
6039             while (scan < loceol && !isDIGIT(*scan))
6040                 scan++;
6041         }
6042     case LNBREAK:
6043         if (utf8_target) {
6044             loceol = PL_regeol;
6045             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6046                 scan += c;
6047                 hardcount++;
6048             }
6049         } else {
6050             /*
6051               LNBREAK can match two latin chars, which is ok,
6052               because we have a null terminated string, but we
6053               have to use hardcount in this situation
6054             */
6055             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6056                 scan+=c;
6057                 hardcount++;
6058             }
6059         }
6060         break;
6061     case HORIZWS:
6062         if (utf8_target) {
6063             loceol = PL_regeol;
6064             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6065                 scan += c;
6066                 hardcount++;
6067             }
6068         } else {
6069             while (scan < loceol && is_HORIZWS_latin1(scan))
6070                 scan++;
6071         }
6072         break;
6073     case NHORIZWS:
6074         if (utf8_target) {
6075             loceol = PL_regeol;
6076             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6077                 scan += UTF8SKIP(scan);
6078                 hardcount++;
6079             }
6080         } else {
6081             while (scan < loceol && !is_HORIZWS_latin1(scan))
6082                 scan++;
6083
6084         }
6085         break;
6086     case VERTWS:
6087         if (utf8_target) {
6088             loceol = PL_regeol;
6089             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6090                 scan += c;
6091                 hardcount++;
6092             }
6093         } else {
6094             while (scan < loceol && is_VERTWS_latin1(scan))
6095                 scan++;
6096
6097         }
6098         break;
6099     case NVERTWS:
6100         if (utf8_target) {
6101             loceol = PL_regeol;
6102             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6103                 scan += UTF8SKIP(scan);
6104                 hardcount++;
6105             }
6106         } else {
6107             while (scan < loceol && !is_VERTWS_latin1(scan))
6108                 scan++;
6109
6110         }
6111         break;
6112
6113     default:            /* Called on something of 0 width. */
6114         break;          /* So match right here or not at all. */
6115     }
6116
6117     if (hardcount)
6118         c = hardcount;
6119     else
6120         c = scan - PL_reginput;
6121     PL_reginput = scan;
6122
6123     DEBUG_r({
6124         GET_RE_DEBUG_FLAGS_DECL;
6125         DEBUG_EXECUTE_r({
6126             SV * const prop = sv_newmortal();
6127             regprop(prog, prop, p);
6128             PerlIO_printf(Perl_debug_log,
6129                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6130                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6131         });
6132     });
6133
6134     return(c);
6135 }
6136
6137
6138 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6139 /*
6140 - regclass_swash - prepare the utf8 swash
6141 */
6142
6143 SV *
6144 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6145 {
6146     dVAR;
6147     SV *sw  = NULL;
6148     SV *si  = NULL;
6149     SV *alt = NULL;
6150     RXi_GET_DECL(prog,progi);
6151     const struct reg_data * const data = prog ? progi->data : NULL;
6152
6153     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6154
6155     if (data && data->count) {
6156         const U32 n = ARG(node);
6157
6158         if (data->what[n] == 's') {
6159             SV * const rv = MUTABLE_SV(data->data[n]);
6160             AV * const av = MUTABLE_AV(SvRV(rv));
6161             SV **const ary = AvARRAY(av);
6162             SV **a, **b;
6163
6164             /* See the end of regcomp.c:S_regclass() for
6165              * documentation of these array elements. */
6166
6167             si = *ary;
6168             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6169             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6170
6171             if (a)
6172                 sw = *a;
6173             else if (si && doinit) {
6174                 sw = swash_init("utf8", "", si, 1, 0);
6175                 (void)av_store(av, 1, sw);
6176             }
6177             if (b)
6178                 alt = *b;
6179         }
6180     }
6181
6182     if (listsvp)
6183         *listsvp = si;
6184     if (altsvp)
6185         *altsvp  = alt;
6186
6187     return sw;
6188 }
6189 #endif
6190
6191 /*
6192  - reginclass - determine if a character falls into a character class
6193
6194   n is the ANYOF regnode
6195   p is the target string
6196   lenp is pointer to the maximum number of bytes of how far to go in p
6197     (This is assumed wthout checking to always be at least the current
6198     character's size)
6199   utf8_target tells whether p is in UTF-8.
6200
6201   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6202   from a successful match, the value it points to will be updated to how many
6203   bytes in p were matched.  If there was no match, the value is undefined,
6204   possibly changed from the input.
6205
6206  */
6207
6208 STATIC bool
6209 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6210 {
6211     dVAR;
6212     const char flags = ANYOF_FLAGS(n);
6213     bool match = FALSE;
6214     UV c = *p;
6215     STRLEN c_len = 0;
6216     STRLEN maxlen;
6217
6218     PERL_ARGS_ASSERT_REGINCLASS;
6219
6220     /* If c is not already the code point, get it */
6221     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6222         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6223                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6224                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6225                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6226                  * UTF8_ALLOW_FFFF */
6227         if (c_len == (STRLEN)-1)
6228             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6229     }
6230     else {
6231         c_len = 1;
6232     }
6233
6234     /* Use passed in max length, or one character if none passed in or less
6235      * than one character.  And assume will match just one character.  This is
6236      * overwritten later if matched more. */
6237     if (lenp) {
6238         maxlen = (*lenp > c_len) ? *lenp : c_len;
6239         *lenp = c_len;
6240
6241     }
6242     else {
6243         maxlen = c_len;
6244     }
6245
6246     /* If this character is potentially in the bitmap, check it */
6247     if (c < 256) {
6248         if (ANYOF_BITMAP_TEST(n, c))
6249             match = TRUE;
6250         else if (flags & ANYOF_FOLD) {
6251             U8 f;
6252
6253             if (flags & ANYOF_LOCALE) {
6254                 PL_reg_flags |= RF_tainted;
6255                 f = PL_fold_locale[c];
6256             }
6257             else
6258                 f = PL_fold[c];
6259             if (f != c && ANYOF_BITMAP_TEST(n, f))
6260                 match = TRUE;
6261         }
6262
6263         if (!match && (flags & ANYOF_CLASS)) {
6264             PL_reg_flags |= RF_tainted;
6265             if (
6266                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6267                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6268                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6269                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6270                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6271                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6272                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6273                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6274                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6275                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6276                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6277                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6278                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6279                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6280                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6281                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6282                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6283                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6284                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6285                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6286                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6287                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6288                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6289                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6290                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6291                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6292                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6293                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6294                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6295                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6296                 ) /* How's that for a conditional? */
6297             {
6298                 match = TRUE;
6299             }
6300         }
6301     }
6302
6303     /* If the bitmap didn't (or couldn't) match, and something outside the
6304      * bitmap could match, try that */
6305     if (!match && (utf8_target || (flags & ANYOF_UNICODE))) {
6306         if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6307             match = TRUE;
6308         }
6309         else {
6310             AV *av;
6311             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6312
6313             if (sw) {
6314                 U8 * utf8_p;
6315                 if (utf8_target) {
6316                     utf8_p = (U8 *) p;
6317                 } else {
6318                     STRLEN len = 1;
6319                     utf8_p = bytes_to_utf8(p, &len);
6320                 }
6321                 if (swash_fetch(sw, utf8_p, 1))
6322                     match = TRUE;
6323                 else if (flags & ANYOF_FOLD) {
6324                     if (!match && lenp && av) {
6325                         I32 i;
6326                         for (i = 0; i <= av_len(av); i++) {
6327                             SV* const sv = *av_fetch(av, i, FALSE);
6328                             STRLEN len;
6329                             const char * const s = SvPV_const(sv, len);
6330                             if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) {
6331                                 *lenp = len;
6332                                 match = TRUE;
6333                                 break;
6334                             }
6335                         }
6336                     }
6337                     if (!match) {
6338                         U8 folded[UTF8_MAXBYTES_CASE+1];
6339
6340                         /* See if the folded version matches */
6341                         STRLEN foldlen;
6342                         to_utf8_fold(utf8_p, folded, &foldlen);
6343                         if (swash_fetch(sw, folded, 1)) {   /* 1 => is utf8 */
6344                             match = TRUE;
6345                         }
6346                         else {
6347                             SV** listp;
6348
6349                             /* Consider "k" =~ /[K]/i.  The line above would
6350                              * have just folded the 'k' to itself, and that
6351                              * isn't going to match 'K'.  So we look through
6352                              * the closure of everything that folds to 'k'.
6353                              * That will find the 'K'.  Initialize the list, if
6354                              * necessary */
6355                             if (! PL_utf8_foldclosures) {
6356
6357                                 /* If the folds haven't been read in, call a fold
6358                              * function to force that */
6359                                 if (! PL_utf8_tofold) {
6360                                     U8 dummy[UTF8_MAXBYTES+1];
6361                                     STRLEN dummy_len;
6362                                     to_utf8_fold((U8*) "A", dummy, &dummy_len);
6363                                 }
6364                                 PL_utf8_foldclosures =
6365                                         _swash_inversion_hash(PL_utf8_tofold);
6366                             }
6367
6368                             /* The data structure is a hash with the keys every
6369                              * character that is folded to, like 'k', and the
6370                              * values each an array of everything that folds to
6371                              * its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
6372                             if ((listp = hv_fetch(PL_utf8_foldclosures,
6373                                             (char *) folded, foldlen, FALSE)))
6374                             {
6375                                 AV* list = (AV*) *listp;
6376                                 IV i;
6377                                 for (i = 0; i <= av_len(list); i++) {
6378                                     SV** try_p = av_fetch(list, i, FALSE);
6379                                     if (try_p == NULL) {
6380                                         Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
6381                                     }
6382                                     /* Don't have to worry about embeded nulls
6383                                      * since NULL isn't folded or foldable */
6384                                     if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) {
6385                                         match = TRUE;
6386                                         break;
6387                                     }
6388                                 }
6389                             }
6390                         }
6391                     }
6392                 }
6393
6394                 /* If we allocated a string above, free it */
6395                 if (! utf8_target) Safefree(utf8_p);
6396             }
6397         }
6398     }
6399
6400     return (flags & ANYOF_INVERT) ? !match : match;
6401 }
6402
6403 STATIC U8 *
6404 S_reghop3(U8 *s, I32 off, const U8* lim)
6405 {
6406     dVAR;
6407
6408     PERL_ARGS_ASSERT_REGHOP3;
6409
6410     if (off >= 0) {
6411         while (off-- && s < lim) {
6412             /* XXX could check well-formedness here */
6413             s += UTF8SKIP(s);
6414         }
6415     }
6416     else {
6417         while (off++ && s > lim) {
6418             s--;
6419             if (UTF8_IS_CONTINUED(*s)) {
6420                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6421                     s--;
6422             }
6423             /* XXX could check well-formedness here */
6424         }
6425     }
6426     return s;
6427 }
6428
6429 #ifdef XXX_dmq
6430 /* there are a bunch of places where we use two reghop3's that should
6431    be replaced with this routine. but since thats not done yet
6432    we ifdef it out - dmq
6433 */
6434 STATIC U8 *
6435 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6436 {
6437     dVAR;
6438
6439     PERL_ARGS_ASSERT_REGHOP4;
6440
6441     if (off >= 0) {
6442         while (off-- && s < rlim) {
6443             /* XXX could check well-formedness here */
6444             s += UTF8SKIP(s);
6445         }
6446     }
6447     else {
6448         while (off++ && s > llim) {
6449             s--;
6450             if (UTF8_IS_CONTINUED(*s)) {
6451                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6452                     s--;
6453             }
6454             /* XXX could check well-formedness here */
6455         }
6456     }
6457     return s;
6458 }
6459 #endif
6460
6461 STATIC U8 *
6462 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6463 {
6464     dVAR;
6465
6466     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6467
6468     if (off >= 0) {
6469         while (off-- && s < lim) {
6470             /* XXX could check well-formedness here */
6471             s += UTF8SKIP(s);
6472         }
6473         if (off >= 0)
6474             return NULL;
6475     }
6476     else {
6477         while (off++ && s > lim) {
6478             s--;
6479             if (UTF8_IS_CONTINUED(*s)) {
6480                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6481                     s--;
6482             }
6483             /* XXX could check well-formedness here */
6484         }
6485         if (off <= 0)
6486             return NULL;
6487     }
6488     return s;
6489 }
6490
6491 static void
6492 restore_pos(pTHX_ void *arg)
6493 {
6494     dVAR;
6495     regexp * const rex = (regexp *)arg;
6496     if (PL_reg_eval_set) {
6497         if (PL_reg_oldsaved) {
6498             rex->subbeg = PL_reg_oldsaved;
6499             rex->sublen = PL_reg_oldsavedlen;
6500 #ifdef PERL_OLD_COPY_ON_WRITE
6501             rex->saved_copy = PL_nrs;
6502 #endif
6503             RXp_MATCH_COPIED_on(rex);
6504         }
6505         PL_reg_magic->mg_len = PL_reg_oldpos;
6506         PL_reg_eval_set = 0;
6507         PL_curpm = PL_reg_oldcurpm;
6508     }
6509 }
6510
6511 STATIC void
6512 S_to_utf8_substr(pTHX_ register regexp *prog)
6513 {
6514     int i = 1;
6515
6516     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6517
6518     do {
6519         if (prog->substrs->data[i].substr
6520             && !prog->substrs->data[i].utf8_substr) {
6521             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6522             prog->substrs->data[i].utf8_substr = sv;
6523             sv_utf8_upgrade(sv);
6524             if (SvVALID(prog->substrs->data[i].substr)) {
6525                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6526                 if (flags & FBMcf_TAIL) {
6527                     /* Trim the trailing \n that fbm_compile added last
6528                        time.  */
6529                     SvCUR_set(sv, SvCUR(sv) - 1);
6530                     /* Whilst this makes the SV technically "invalid" (as its
6531                        buffer is no longer followed by "\0") when fbm_compile()
6532                        adds the "\n" back, a "\0" is restored.  */
6533                 }
6534                 fbm_compile(sv, flags);
6535             }
6536             if (prog->substrs->data[i].substr == prog->check_substr)
6537                 prog->check_utf8 = sv;
6538         }
6539     } while (i--);
6540 }
6541
6542 STATIC void
6543 S_to_byte_substr(pTHX_ register regexp *prog)
6544 {
6545     dVAR;
6546     int i = 1;
6547
6548     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6549
6550     do {
6551         if (prog->substrs->data[i].utf8_substr
6552             && !prog->substrs->data[i].substr) {
6553             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6554             if (sv_utf8_downgrade(sv, TRUE)) {
6555                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6556                     const U8 flags
6557                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6558                     if (flags & FBMcf_TAIL) {
6559                         /* Trim the trailing \n that fbm_compile added last
6560                            time.  */
6561                         SvCUR_set(sv, SvCUR(sv) - 1);
6562                     }
6563                     fbm_compile(sv, flags);
6564                 }
6565             } else {
6566                 SvREFCNT_dec(sv);
6567                 sv = &PL_sv_undef;
6568             }
6569             prog->substrs->data[i].substr = sv;
6570             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6571                 prog->check_substr = sv;
6572         }
6573     } while (i--);
6574 }
6575
6576 /*
6577  * Local variables:
6578  * c-indentation-style: bsd
6579  * c-basic-offset: 4
6580  * indent-tabs-mode: t
6581  * End:
6582  *
6583  * ex: set ts=8 sts=4 sw=4 noet:
6584  */