src/5013008/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted      1               /* tainted information used? */
  85 #define RF_warned       2               /* warned about big count? */
  86
  87 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init         1               /* eval environment created */
  92 #define RS_set          2               /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC  static
  96 #endif
  97
  98 /* Valid for non-utf8 strings only: avoids the reginclass call if there are no
  99  * complications: i.e., if everything matchable is straight forward in the
 100  * bitmap */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102                                               : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
 111 #define HOPc(pos,off) \
 112         (char *)(PL_reg_match_utf8 \
 113             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114             : (U8*)(pos + off))
 115 #define HOPBACKc(pos, off) \
 116         (char*)(PL_reg_match_utf8\
 117             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118             : (pos - off >= PL_bostr)           \
 119                 ? (U8*)pos - off                \
 120                 : NULL)
 121
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defined */
 126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 127     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 128
 129 /* Doesn't do an assert to verify that is correct */
 130 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 131     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 132
 133 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 134 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 135 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 136
 137 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 138         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 139         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 140         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 141             * assert should likely and hopefully fail on an EBCDIC machine */ \
 142         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 143                                                                             \
 144         /* No asserts are done for these, in case called on an early        \
 145             * Unicode version in which they map to nothing */               \
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 148         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 149         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 150         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 151         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 152         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 153
 154 /*
 155    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 156    so that it is possible to override the option here without having to
 157    rebuild the entire core. as we are required to do if we change regcomp.h
 158    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 159 */
 160 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 161 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 162 #endif
 163
 164 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 165 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 166 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 167 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 168 #define RE_utf8_perl_word   PL_utf8_alnum
 169 #define RE_utf8_perl_space  PL_utf8_space
 170 #define RE_utf8_posix_digit PL_utf8_digit
 171 #define perl_word  alnum
 172 #define perl_space space
 173 #define posix_digit digit
 174 #else
 175 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 176 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 177 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 178 #define RE_utf8_perl_word   PL_utf8_perl_word
 179 #define RE_utf8_perl_space  PL_utf8_perl_space
 180 #define RE_utf8_posix_digit PL_utf8_posix_digit
 181 #endif
 182
 183
 184 #define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)          \
 185         case NAMEL:                                                         \
 186             PL_reg_flags |= RF_tainted;                                     \
 187             /* FALL THROUGH */                                              \
 188         case NAME:                                                          \
 189             if (!nextchr)                                                   \
 190                 sayNO;                                                      \
 191             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                \
 192                 if (!CAT2(PL_utf8_,CLASS)) {                                \
 193                     bool ok;                                                \
 194                     ENTER;                                                  \
 195                     save_re_context();                                      \
 196                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                \
 197                     assert(ok);                                             \
 198                     LEAVE;                                                  \
 199                 }                                                           \
 200                 if (!(OP(scan) == NAME                                      \
 201                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 202                     : LCFUNC_utf8((U8*)locinput)))                          \
 203                 {                                                           \
 204                     sayNO;                                                  \
 205                 }                                                           \
 206                 locinput += PL_utf8skip[nextchr];                           \
 207                 nextchr = UCHARAT(locinput);                                \
 208                 break;                                                      \
 209             }                                                               \
 210             /* Drops through to the macro that calls this one */
 211
 212 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)           \
 213     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)              \
 214             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))      \
 215                 sayNO;                                                      \
 216             nextchr = UCHARAT(++locinput);                                  \
 217             break
 218
 219 /* Almost identical to the above, but has a case for a node that matches chars
 220  * between 128 and 255 using Unicode (latin1) semantics. */
 221 #define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 222     _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 223             if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 224                 sayNO;                                                       \
 225             nextchr = UCHARAT(++locinput);                                   \
 226             break
 227
 228 #define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)           \
 229         case NAMEL:                                                          \
 230             PL_reg_flags |= RF_tainted;                                      \
 231             /* FALL THROUGH */                                               \
 232         case NAME :                                                          \
 233             if (!nextchr && locinput >= PL_regeol)                           \
 234                 sayNO;                                                       \
 235             if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                 \
 236                 if (!CAT2(PL_utf8_,CLASS)) {                                 \
 237                     bool ok;                                                 \
 238                     ENTER;                                                   \
 239                     save_re_context();                                       \
 240                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                 \
 241                     assert(ok);                                              \
 242                     LEAVE;                                                   \
 243                 }                                                            \
 244                 if ((OP(scan) == NAME                                        \
 245                     ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target))  \
 246                     : LCFUNC_utf8((U8*)locinput)))                           \
 247                 {                                                            \
 248                     sayNO;                                                   \
 249                 }                                                            \
 250                 locinput += PL_utf8skip[nextchr];                            \
 251                 nextchr = UCHARAT(locinput);                                 \
 252                 break;                                                       \
 253             }
 254
 255 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)            \
 256     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC)               \
 257             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))        \
 258                 sayNO;                                                       \
 259             nextchr = UCHARAT(++locinput);                                   \
 260             break
 261
 262
 263 #define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC)         \
 264     _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU)              \
 265             if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
 266                 sayNO;                                                       \
 267             nextchr = UCHARAT(++locinput);                                   \
 268             break
 269
 270
 271
 272 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 273
 274 /* for use after a quantifier and before an EXACT-like node -- japhy */
 275 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 276  *
 277  * NOTE that *nothing* that affects backtracking should be in here, specifically
 278  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 279  * node that is in between two EXACT like nodes when ascertaining what the required
 280  * "follow" character is. This should probably be moved to regex compile time
 281  * although it may be done at run time beause of the REF possibility - more
 282  * investigation required. -- demerphq
 283 */
 284 #define JUMPABLE(rn) (      \
 285     OP(rn) == OPEN ||       \
 286     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 287     OP(rn) == EVAL ||   \
 288     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 289     OP(rn) == PLUS || OP(rn) == MINMOD || \
 290     OP(rn) == KEEPS || \
 291     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 292 )
 293 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 294
 295 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 296
 297 #if 0
 298 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 299    we don't need this definition. */
 300 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 301 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 302 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 303
 304 #else
 305 /* ... so we use this as its faster. */
 306 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 307 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU )
 308 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 309 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 310
 311 #endif
 312
 313 /*
 314   Search for mandatory following text node; for lookahead, the text must
 315   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 316 */
 317 #define FIND_NEXT_IMPT(rn) STMT_START { \
 318     while (JUMPABLE(rn)) { \
 319         const OPCODE type = OP(rn); \
 320         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 321             rn = NEXTOPER(NEXTOPER(rn)); \
 322         else if (type == PLUS) \
 323             rn = NEXTOPER(rn); \
 324         else if (type == IFMATCH) \
 325             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 326         else rn += NEXT_OFF(rn); \
 327     } \
 328 } STMT_END
 329
 330
 331 static void restore_pos(pTHX_ void *arg);
 332
 333 #define REGCP_PAREN_ELEMS 4
 334 #define REGCP_OTHER_ELEMS 5
 335 #define REGCP_FRAME_ELEMS 1
 336 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 337  * are needed for the regexp context stack bookkeeping. */
 338
 339 STATIC CHECKPOINT
 340 S_regcppush(pTHX_ I32 parenfloor)
 341 {
 342     dVAR;
 343     const int retval = PL_savestack_ix;
 344     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 345     const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 346     const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 347     int p;
 348     GET_RE_DEBUG_FLAGS_DECL;
 349
 350     if (paren_elems_to_push < 0)
 351         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 352
 353     if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 354         Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 355                    " out of range (%lu-%ld)",
 356                    total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 357
 358     SSGROW(total_elems + REGCP_FRAME_ELEMS);
 359
 360     for (p = PL_regsize; p > parenfloor; p--) {
 361 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 362         SSPUSHINT(PL_regoffs[p].end);
 363         SSPUSHINT(PL_regoffs[p].start);
 364         SSPUSHPTR(PL_reg_start_tmp[p]);
 365         SSPUSHINT(p);
 366         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 367           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 368                       (UV)p, (IV)PL_regoffs[p].start,
 369                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 370                       (IV)PL_regoffs[p].end
 371         ));
 372     }
 373 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 374     SSPUSHPTR(PL_regoffs);
 375     SSPUSHINT(PL_regsize);
 376     SSPUSHINT(*PL_reglastparen);
 377     SSPUSHINT(*PL_reglastcloseparen);
 378     SSPUSHPTR(PL_reginput);
 379     SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 380
 381     return retval;
 382 }
 383
 384 /* These are needed since we do not localize EVAL nodes: */
 385 #define REGCP_SET(cp)                                           \
 386     DEBUG_STATE_r(                                              \
 387             PerlIO_printf(Perl_debug_log,                       \
 388                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 389                 (IV)PL_savestack_ix));                          \
 390     cp = PL_savestack_ix
 391
 392 #define REGCP_UNWIND(cp)                                        \
 393     DEBUG_STATE_r(                                              \
 394         if (cp != PL_savestack_ix)                              \
 395             PerlIO_printf(Perl_debug_log,                       \
 396                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 397                 (IV)(cp), (IV)PL_savestack_ix));                \
 398     regcpblow(cp)
 399
 400 STATIC char *
 401 S_regcppop(pTHX_ const regexp *rex)
 402 {
 403     dVAR;
 404     UV i;
 405     char *input;
 406     GET_RE_DEBUG_FLAGS_DECL;
 407
 408     PERL_ARGS_ASSERT_REGCPPOP;
 409
 410     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 411     i = SSPOPUV;
 412     assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 413     i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 414     input = (char *) SSPOPPTR;
 415     *PL_reglastcloseparen = SSPOPINT;
 416     *PL_reglastparen = SSPOPINT;
 417     PL_regsize = SSPOPINT;
 418     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 419
 420     i -= REGCP_OTHER_ELEMS;
 421     /* Now restore the parentheses context. */
 422     for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 423         I32 tmps;
 424         U32 paren = (U32)SSPOPINT;
 425         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 426         PL_regoffs[paren].start = SSPOPINT;
 427         tmps = SSPOPINT;
 428         if (paren <= *PL_reglastparen)
 429             PL_regoffs[paren].end = tmps;
 430         DEBUG_BUFFERS_r(
 431             PerlIO_printf(Perl_debug_log,
 432                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 433                           (UV)paren, (IV)PL_regoffs[paren].start,
 434                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 435                           (IV)PL_regoffs[paren].end,
 436                           (paren > *PL_reglastparen ? "(no)" : ""));
 437         );
 438     }
 439     DEBUG_BUFFERS_r(
 440         if (*PL_reglastparen + 1 <= rex->nparens) {
 441             PerlIO_printf(Perl_debug_log,
 442                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 443                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 444         }
 445     );
 446 #if 1
 447     /* It would seem that the similar code in regtry()
 448      * already takes care of this, and in fact it is in
 449      * a better location to since this code can #if 0-ed out
 450      * but the code in regtry() is needed or otherwise tests
 451      * requiring null fields (pat.t#187 and split.t#{13,14}
 452      * (as of patchlevel 7877)  will fail.  Then again,
 453      * this code seems to be necessary or otherwise
 454      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 455      * --jhi updated by dapm */
 456     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 457         if (i > PL_regsize)
 458             PL_regoffs[i].start = -1;
 459         PL_regoffs[i].end = -1;
 460     }
 461 #endif
 462     return input;
 463 }
 464
 465 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 466
 467 /*
 468  * pregexec and friends
 469  */
 470
 471 #ifndef PERL_IN_XSUB_RE
 472 /*
 473  - pregexec - match a regexp against a string
 474  */
 475 I32
 476 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 477          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 478 /* strend: pointer to null at end of string */
 479 /* strbeg: real beginning of string */
 480 /* minend: end of match must be >=minend after stringarg. */
 481 /* nosave: For optimizations. */
 482 {
 483     PERL_ARGS_ASSERT_PREGEXEC;
 484
 485     return
 486         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 487                       nosave ? 0 : REXEC_COPY_STR);
 488 }
 489 #endif
 490
 491 /*
 492  * Need to implement the following flags for reg_anch:
 493  *
 494  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 495  * USE_INTUIT_ML
 496  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 497  * INTUIT_AUTORITATIVE_ML
 498  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 499  * INTUIT_ONCE_ML
 500  *
 501  * Another flag for this function: SECOND_TIME (so that float substrs
 502  * with giant delta may be not rechecked).
 503  */
 504
 505 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 506
 507 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 508    Otherwise, only SvCUR(sv) is used to get strbeg. */
 509
 510 /* XXXX We assume that strpos is strbeg unless sv. */
 511
 512 /* XXXX Some places assume that there is a fixed substring.
 513         An update may be needed if optimizer marks as "INTUITable"
 514         RExen without fixed substrings.  Similarly, it is assumed that
 515         lengths of all the strings are no more than minlen, thus they
 516         cannot come from lookahead.
 517         (Or minlen should take into account lookahead.)
 518   NOTE: Some of this comment is not correct. minlen does now take account
 519   of lookahead/behind. Further research is required. -- demerphq
 520
 521 */
 522
 523 /* A failure to find a constant substring means that there is no need to make
 524    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 525    finding a substring too deep into the string means that less calls to
 526    regtry() should be needed.
 527
 528    REx compiler's optimizer found 4 possible hints:
 529         a) Anchored substring;
 530         b) Fixed substring;
 531         c) Whether we are anchored (beginning-of-line or \G);
 532         d) First node (of those at offset 0) which may distingush positions;
 533    We use a)b)d) and multiline-part of c), and try to find a position in the
 534    string which does not contradict any of them.
 535  */
 536
 537 /* Most of decisions we do here should have been done at compile time.
 538    The nodes of the REx which we used for the search should have been
 539    deleted from the finite automaton. */
 540
 541 char *
 542 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 543                      char *strend, const U32 flags, re_scream_pos_data *data)
 544 {
 545     dVAR;
 546     struct regexp *const prog = (struct regexp *)SvANY(rx);
 547     register I32 start_shift = 0;
 548     /* Should be nonnegative! */
 549     register I32 end_shift   = 0;
 550     register char *s;
 551     register SV *check;
 552     char *strbeg;
 553     char *t;
 554     const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 555     I32 ml_anch;
 556     register char *other_last = NULL;   /* other substr checked before this */
 557     char *check_at = NULL;              /* check substr found at this pos */
 558     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 559     RXi_GET_DECL(prog,progi);
 560 #ifdef DEBUGGING
 561     const char * const i_strpos = strpos;
 562 #endif
 563     GET_RE_DEBUG_FLAGS_DECL;
 564
 565     PERL_ARGS_ASSERT_RE_INTUIT_START;
 566
 567     RX_MATCH_UTF8_set(rx,utf8_target);
 568
 569     if (RX_UTF8(rx)) {
 570         PL_reg_flags |= RF_utf8;
 571     }
 572     DEBUG_EXECUTE_r(
 573         debug_start_match(rx, utf8_target, strpos, strend,
 574             sv ? "Guessing start of match in sv for"
 575                : "Guessing start of match in string for");
 576               );
 577
 578     /* CHR_DIST() would be more correct here but it makes things slow. */
 579     if (prog->minlen > strend - strpos) {
 580         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 581                               "String too short... [re_intuit_start]\n"));
 582         goto fail;
 583     }
 584
 585     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 586     PL_regeol = strend;
 587     if (utf8_target) {
 588         if (!prog->check_utf8 && prog->check_substr)
 589             to_utf8_substr(prog);
 590         check = prog->check_utf8;
 591     } else {
 592         if (!prog->check_substr && prog->check_utf8)
 593             to_byte_substr(prog);
 594         check = prog->check_substr;
 595     }
 596     if (check == &PL_sv_undef) {
 597         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 598                 "Non-utf8 string cannot match utf8 check string\n"));
 599         goto fail;
 600     }
 601     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 602         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 603                      || ( (prog->extflags & RXf_ANCH_BOL)
 604                           && !multiline ) );    /* Check after \n? */
 605
 606         if (!ml_anch) {
 607           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 608                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 609                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 610                && sv && !SvROK(sv)
 611                && (strpos != strbeg)) {
 612               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 613               goto fail;
 614           }
 615           if (prog->check_offset_min == prog->check_offset_max &&
 616               !(prog->extflags & RXf_CANY_SEEN)) {
 617             /* Substring at constant offset from beg-of-str... */
 618             I32 slen;
 619
 620             s = HOP3c(strpos, prog->check_offset_min, strend);
 621
 622             if (SvTAIL(check)) {
 623                 slen = SvCUR(check);    /* >= 1 */
 624
 625                 if ( strend - s > slen || strend - s < slen - 1
 626                      || (strend - s == slen && strend[-1] != '\n')) {
 627                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 628                     goto fail_finish;
 629                 }
 630                 /* Now should match s[0..slen-2] */
 631                 slen--;
 632                 if (slen && (*SvPVX_const(check) != *s
 633                              || (slen > 1
 634                                  && memNE(SvPVX_const(check), s, slen)))) {
 635                   report_neq:
 636                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 637                     goto fail_finish;
 638                 }
 639             }
 640             else if (*SvPVX_const(check) != *s
 641                      || ((slen = SvCUR(check)) > 1
 642                          && memNE(SvPVX_const(check), s, slen)))
 643                 goto report_neq;
 644             check_at = s;
 645             goto success_at_start;
 646           }
 647         }
 648         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 649         s = strpos;
 650         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 651         end_shift = prog->check_end_shift;
 652
 653         if (!ml_anch) {
 654             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 655                                          - (SvTAIL(check) != 0);
 656             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 657
 658             if (end_shift < eshift)
 659                 end_shift = eshift;
 660         }
 661     }
 662     else {                              /* Can match at random position */
 663         ml_anch = 0;
 664         s = strpos;
 665         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 666         end_shift = prog->check_end_shift;
 667
 668         /* end shift should be non negative here */
 669     }
 670
 671 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 672     if (end_shift < 0)
 673         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 674                    (IV)end_shift, RX_PRECOMP(prog));
 675 #endif
 676
 677   restart:
 678     /* Find a possible match in the region s..strend by looking for
 679        the "check" substring in the region corrected by start/end_shift. */
 680
 681     {
 682         I32 srch_start_shift = start_shift;
 683         I32 srch_end_shift = end_shift;
 684         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 685             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 686             srch_start_shift = strbeg - s;
 687         }
 688     DEBUG_OPTIMISE_MORE_r({
 689         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 690             (IV)prog->check_offset_min,
 691             (IV)srch_start_shift,
 692             (IV)srch_end_shift,
 693             (IV)prog->check_end_shift);
 694     });
 695
 696     if (flags & REXEC_SCREAM) {
 697         I32 p = -1;                     /* Internal iterator of scream. */
 698         I32 * const pp = data ? data->scream_pos : &p;
 699
 700         if (PL_screamfirst[BmRARE(check)] >= 0
 701             || ( BmRARE(check) == '\n'
 702                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 703                  && SvTAIL(check) ))
 704             s = screaminstr(sv, check,
 705                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 706         else
 707             goto fail_finish;
 708         /* we may be pointing at the wrong string */
 709         if (s && RXp_MATCH_COPIED(prog))
 710             s = strbeg + (s - SvPVX_const(sv));
 711         if (data)
 712             *data->scream_olds = s;
 713     }
 714     else {
 715         U8* start_point;
 716         U8* end_point;
 717         if (prog->extflags & RXf_CANY_SEEN) {
 718             start_point= (U8*)(s + srch_start_shift);
 719             end_point= (U8*)(strend - srch_end_shift);
 720         } else {
 721             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 722             end_point= HOP3(strend, -srch_end_shift, strbeg);
 723         }
 724         DEBUG_OPTIMISE_MORE_r({
 725             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 726                 (int)(end_point - start_point),
 727                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 728                 start_point);
 729         });
 730
 731         s = fbm_instr( start_point, end_point,
 732                       check, multiline ? FBMrf_MULTILINE : 0);
 733     }
 734     }
 735     /* Update the count-of-usability, remove useless subpatterns,
 736         unshift s.  */
 737
 738     DEBUG_EXECUTE_r({
 739         RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 740             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 741         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 742                           (s ? "Found" : "Did not find"),
 743             (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 744                 ? "anchored" : "floating"),
 745             quoted,
 746             RE_SV_TAIL(check),
 747             (s ? " at offset " : "...\n") );
 748     });
 749
 750     if (!s)
 751         goto fail_finish;
 752     /* Finish the diagnostic message */
 753     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 754
 755     /* XXX dmq: first branch is for positive lookbehind...
 756        Our check string is offset from the beginning of the pattern.
 757        So we need to do any stclass tests offset forward from that
 758        point. I think. :-(
 759      */
 760
 761
 762
 763     check_at=s;
 764
 765
 766     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 767        Start with the other substr.
 768        XXXX no SCREAM optimization yet - and a very coarse implementation
 769        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 770                 *always* match.  Probably should be marked during compile...
 771        Probably it is right to do no SCREAM here...
 772      */
 773
 774     if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 775                 : (prog->float_substr && prog->anchored_substr))
 776     {
 777         /* Take into account the "other" substring. */
 778         /* XXXX May be hopelessly wrong for UTF... */
 779         if (!other_last)
 780             other_last = strpos;
 781         if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 782           do_other_anchored:
 783             {
 784                 char * const last = HOP3c(s, -start_shift, strbeg);
 785                 char *last1, *last2;
 786                 char * const saved_s = s;
 787                 SV* must;
 788
 789                 t = s - prog->check_offset_max;
 790                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 791                     && (!utf8_target
 792                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 793                             && t > strpos)))
 794                     NOOP;
 795                 else
 796                     t = strpos;
 797                 t = HOP3c(t, prog->anchored_offset, strend);
 798                 if (t < other_last)     /* These positions already checked */
 799                     t = other_last;
 800                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 801                 if (last < last1)
 802                     last1 = last;
 803                 /* XXXX It is not documented what units *_offsets are in.
 804                    We assume bytes, but this is clearly wrong.
 805                    Meaning this code needs to be carefully reviewed for errors.
 806                    dmq.
 807                   */
 808
 809                 /* On end-of-str: see comment below. */
 810                 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 811                 if (must == &PL_sv_undef) {
 812                     s = (char*)NULL;
 813                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 814                 }
 815                 else
 816                     s = fbm_instr(
 817                         (unsigned char*)t,
 818                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 819                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 820                         must,
 821                         multiline ? FBMrf_MULTILINE : 0
 822                     );
 823                 DEBUG_EXECUTE_r({
 824                     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 825                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 826                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 827                         (s ? "Found" : "Contradicts"),
 828                         quoted, RE_SV_TAIL(must));
 829                 });
 830
 831
 832                 if (!s) {
 833                     if (last1 >= last2) {
 834                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 835                                                 ", giving up...\n"));
 836                         goto fail_finish;
 837                     }
 838                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 839                         ", trying floating at offset %ld...\n",
 840                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 841                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 842                     s = HOP3c(last, 1, strend);
 843                     goto restart;
 844                 }
 845                 else {
 846                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 847                           (long)(s - i_strpos)));
 848                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 849                     other_last = HOP3c(s, 1, strend);
 850                     s = saved_s;
 851                     if (t == strpos)
 852                         goto try_at_start;
 853                     goto try_at_offset;
 854                 }
 855             }
 856         }
 857         else {          /* Take into account the floating substring. */
 858             char *last, *last1;
 859             char * const saved_s = s;
 860             SV* must;
 861
 862             t = HOP3c(s, -start_shift, strbeg);
 863             last1 = last =
 864                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 865             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 866                 last = HOP3c(t, prog->float_max_offset, strend);
 867             s = HOP3c(t, prog->float_min_offset, strend);
 868             if (s < other_last)
 869                 s = other_last;
 870  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 871             must = utf8_target ? prog->float_utf8 : prog->float_substr;
 872             /* fbm_instr() takes into account exact value of end-of-str
 873                if the check is SvTAIL(ed).  Since false positives are OK,
 874                and end-of-str is not later than strend we are OK. */
 875             if (must == &PL_sv_undef) {
 876                 s = (char*)NULL;
 877                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 878             }
 879             else
 880                 s = fbm_instr((unsigned char*)s,
 881                               (unsigned char*)last + SvCUR(must)
 882                                   - (SvTAIL(must)!=0),
 883                               must, multiline ? FBMrf_MULTILINE : 0);
 884             DEBUG_EXECUTE_r({
 885                 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 886                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 887                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 888                     (s ? "Found" : "Contradicts"),
 889                     quoted, RE_SV_TAIL(must));
 890             });
 891             if (!s) {
 892                 if (last1 == last) {
 893                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 894                                             ", giving up...\n"));
 895                     goto fail_finish;
 896                 }
 897                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 898                     ", trying anchored starting at offset %ld...\n",
 899                     (long)(saved_s + 1 - i_strpos)));
 900                 other_last = last;
 901                 s = HOP3c(t, 1, strend);
 902                 goto restart;
 903             }
 904             else {
 905                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 906                       (long)(s - i_strpos)));
 907                 other_last = s; /* Fix this later. --Hugo */
 908                 s = saved_s;
 909                 if (t == strpos)
 910                     goto try_at_start;
 911                 goto try_at_offset;
 912             }
 913         }
 914     }
 915
 916
 917     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 918
 919     DEBUG_OPTIMISE_MORE_r(
 920         PerlIO_printf(Perl_debug_log,
 921             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 922             (IV)prog->check_offset_min,
 923             (IV)prog->check_offset_max,
 924             (IV)(s-strpos),
 925             (IV)(t-strpos),
 926             (IV)(t-s),
 927             (IV)(strend-strpos)
 928         )
 929     );
 930
 931     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 932         && (!utf8_target
 933             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 934                  && t > strpos)))
 935     {
 936         /* Fixed substring is found far enough so that the match
 937            cannot start at strpos. */
 938       try_at_offset:
 939         if (ml_anch && t[-1] != '\n') {
 940             /* Eventually fbm_*() should handle this, but often
 941                anchored_offset is not 0, so this check will not be wasted. */
 942             /* XXXX In the code below we prefer to look for "^" even in
 943                presence of anchored substrings.  And we search even
 944                beyond the found float position.  These pessimizations
 945                are historical artefacts only.  */
 946           find_anchor:
 947             while (t < strend - prog->minlen) {
 948                 if (*t == '\n') {
 949                     if (t < check_at - prog->check_offset_min) {
 950                         if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 951                             /* Since we moved from the found position,
 952                                we definitely contradict the found anchored
 953                                substr.  Due to the above check we do not
 954                                contradict "check" substr.
 955                                Thus we can arrive here only if check substr
 956                                is float.  Redo checking for "other"=="fixed".
 957                              */
 958                             strpos = t + 1;
 959                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 960                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 961                             goto do_other_anchored;
 962                         }
 963                         /* We don't contradict the found floating substring. */
 964                         /* XXXX Why not check for STCLASS? */
 965                         s = t + 1;
 966                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 967                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 968                         goto set_useful;
 969                     }
 970                     /* Position contradicts check-string */
 971                     /* XXXX probably better to look for check-string
 972                        than for "\n", so one should lower the limit for t? */
 973                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 974                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 975                     other_last = strpos = s = t + 1;
 976                     goto restart;
 977                 }
 978                 t++;
 979             }
 980             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 981                         PL_colors[0], PL_colors[1]));
 982             goto fail_finish;
 983         }
 984         else {
 985             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 986                         PL_colors[0], PL_colors[1]));
 987         }
 988         s = t;
 989       set_useful:
 990         ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr);        /* hooray/5 */
 991     }
 992     else {
 993         /* The found string does not prohibit matching at strpos,
 994            - no optimization of calling REx engine can be performed,
 995            unless it was an MBOL and we are not after MBOL,
 996            or a future STCLASS check will fail this. */
 997       try_at_start:
 998         /* Even in this situation we may use MBOL flag if strpos is offset
 999            wrt the start of the string. */
1000         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1001             && (strpos != strbeg) && strpos[-1] != '\n'
1002             /* May be due to an implicit anchor of m{.*foo}  */
1003             && !(prog->intflags & PREGf_IMPLICIT))
1004         {
1005             t = strpos;
1006             goto find_anchor;
1007         }
1008         DEBUG_EXECUTE_r( if (ml_anch)
1009             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1010                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1011         );
1012       success_at_start:
1013         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
1014             && (utf8_target ? (
1015                 prog->check_utf8                /* Could be deleted already */
1016                 && --BmUSEFUL(prog->check_utf8) < 0
1017                 && (prog->check_utf8 == prog->float_utf8)
1018             ) : (
1019                 prog->check_substr              /* Could be deleted already */
1020                 && --BmUSEFUL(prog->check_substr) < 0
1021                 && (prog->check_substr == prog->float_substr)
1022             )))
1023         {
1024             /* If flags & SOMETHING - do not do it many times on the same match */
1025             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1026             /* XXX Does the destruction order has to change with utf8_target? */
1027             SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1028             SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1029             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
1030             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
1031             check = NULL;                       /* abort */
1032             s = strpos;
1033             /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag
1034                     see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1035             if (prog->intflags & PREGf_IMPLICIT)
1036                 prog->extflags &= ~RXf_ANCH_MBOL;
1037             /* XXXX This is a remnant of the old implementation.  It
1038                     looks wasteful, since now INTUIT can use many
1039                     other heuristics. */
1040             prog->extflags &= ~RXf_USE_INTUIT;
1041             /* XXXX What other flags might need to be cleared in this branch? */
1042         }
1043         else
1044             s = strpos;
1045     }
1046
1047     /* Last resort... */
1048     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1049     /* trie stclasses are too expensive to use here, we are better off to
1050        leave it to regmatch itself */
1051     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1052         /* minlen == 0 is possible if regstclass is \b or \B,
1053            and the fixed substr is ''$.
1054            Since minlen is already taken into account, s+1 is before strend;
1055            accidentally, minlen >= 1 guaranties no false positives at s + 1
1056            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1057            regstclass does not come from lookahead...  */
1058         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1059            This leaves EXACTF, EXACTFU only, which are dealt with in find_byclass().  */
1060         const U8* const str = (U8*)STRING(progi->regstclass);
1061         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1062                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1063                     : 1);
1064         char * endpos;
1065         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1066             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1067         else if (prog->float_substr || prog->float_utf8)
1068             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1069         else
1070             endpos= strend;
1071
1072         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1073                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1074
1075         t = s;
1076         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1077         if (!s) {
1078 #ifdef DEBUGGING
1079             const char *what = NULL;
1080 #endif
1081             if (endpos == strend) {
1082                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1083                                 "Could not match STCLASS...\n") );
1084                 goto fail;
1085             }
1086             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1087                                    "This position contradicts STCLASS...\n") );
1088             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1089                 goto fail;
1090             /* Contradict one of substrings */
1091             if (prog->anchored_substr || prog->anchored_utf8) {
1092                 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1093                     DEBUG_EXECUTE_r( what = "anchored" );
1094                   hop_and_restart:
1095                     s = HOP3c(t, 1, strend);
1096                     if (s + start_shift + end_shift > strend) {
1097                         /* XXXX Should be taken into account earlier? */
1098                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1099                                                "Could not match STCLASS...\n") );
1100                         goto fail;
1101                     }
1102                     if (!check)
1103                         goto giveup;
1104                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1105                                 "Looking for %s substr starting at offset %ld...\n",
1106                                  what, (long)(s + start_shift - i_strpos)) );
1107                     goto restart;
1108                 }
1109                 /* Have both, check_string is floating */
1110                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1111                     goto retry_floating_check;
1112                 /* Recheck anchored substring, but not floating... */
1113                 s = check_at;
1114                 if (!check)
1115                     goto giveup;
1116                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1117                           "Looking for anchored substr starting at offset %ld...\n",
1118                           (long)(other_last - i_strpos)) );
1119                 goto do_other_anchored;
1120             }
1121             /* Another way we could have checked stclass at the
1122                current position only: */
1123             if (ml_anch) {
1124                 s = t = t + 1;
1125                 if (!check)
1126                     goto giveup;
1127                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1128                           "Looking for /%s^%s/m starting at offset %ld...\n",
1129                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1130                 goto try_at_offset;
1131             }
1132             if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1133                 goto fail;
1134             /* Check is floating subtring. */
1135           retry_floating_check:
1136             t = check_at - start_shift;
1137             DEBUG_EXECUTE_r( what = "floating" );
1138             goto hop_and_restart;
1139         }
1140         if (t != s) {
1141             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1142                         "By STCLASS: moving %ld --> %ld\n",
1143                                   (long)(t - i_strpos), (long)(s - i_strpos))
1144                    );
1145         }
1146         else {
1147             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1148                                   "Does not contradict STCLASS...\n");
1149                    );
1150         }
1151     }
1152   giveup:
1153     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1154                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1155                           PL_colors[5], (long)(s - i_strpos)) );
1156     return s;
1157
1158   fail_finish:                          /* Substring not found */
1159     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1160         BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1161   fail:
1162     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1163                           PL_colors[4], PL_colors[5]));
1164     return NULL;
1165 }
1166
1167 #define DECL_TRIE_TYPE(scan) \
1168     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1169                     trie_type = (scan->flags != EXACT) \
1170                               ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1171                               : (utf8_target ? trie_utf8 : trie_plain)
1172
1173 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1174 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1175     switch (trie_type) {                                                    \
1176     case trie_utf8_fold:                                                    \
1177         if ( foldlen>0 ) {                                                  \
1178             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1179             foldlen -= len;                                                 \
1180             uscan += len;                                                   \
1181             len=0;                                                          \
1182         } else {                                                            \
1183             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1184             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1185             foldlen -= UNISKIP( uvc );                                      \
1186             uscan = foldbuf + UNISKIP( uvc );                               \
1187         }                                                                   \
1188         break;                                                              \
1189     case trie_latin_utf8_fold:                                              \
1190         if ( foldlen>0 ) {                                                  \
1191             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1192             foldlen -= len;                                                 \
1193             uscan += len;                                                   \
1194             len=0;                                                          \
1195         } else {                                                            \
1196             len = 1;                                                        \
1197             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1198             foldlen -= UNISKIP( uvc );                                      \
1199             uscan = foldbuf + UNISKIP( uvc );                               \
1200         }                                                                   \
1201         break;                                                              \
1202     case trie_utf8:                                                         \
1203         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1204         break;                                                              \
1205     case trie_plain:                                                        \
1206         uvc = (UV)*uc;                                                      \
1207         len = 1;                                                            \
1208     }                                                                       \
1209     if (uvc < 256) {                                                        \
1210         charid = trie->charmap[ uvc ];                                      \
1211     }                                                                       \
1212     else {                                                                  \
1213         charid = 0;                                                         \
1214         if (widecharmap) {                                                  \
1215             SV** const svpp = hv_fetch(widecharmap,                         \
1216                         (char*)&uvc, sizeof(UV), 0);                        \
1217             if (svpp)                                                       \
1218                 charid = (U16)SvIV(*svpp);                                  \
1219         }                                                                   \
1220     }                                                                       \
1221 } STMT_END
1222
1223 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1224 {                                                      \
1225     char *my_strend= (char *)strend;                   \
1226     if ( (CoNd)                                        \
1227          && (ln == len ||                              \
1228              foldEQ_utf8(s, &my_strend, 0,  utf8_target,   \
1229                         m, NULL, ln, cBOOL(UTF_PATTERN)))      \
1230          && (!reginfo || regtry(reginfo, &s)) )        \
1231         goto got_it;                                   \
1232     else {                                             \
1233          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1234          uvchr_to_utf8(tmpbuf, c);                     \
1235          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1236          if ( f != c                                   \
1237               && (f == c1 || f == c2)                  \
1238               && (ln == len ||                         \
1239                 foldEQ_utf8(s, &my_strend, 0,  utf8_target,\
1240                               m, NULL, ln, cBOOL(UTF_PATTERN)))\
1241               && (!reginfo || regtry(reginfo, &s)) )   \
1242               goto got_it;                             \
1243     }                                                  \
1244 }                                                      \
1245 s += len
1246
1247 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1248 STMT_START {                                              \
1249     re_fold_t folder;                                   \
1250     switch (OP(c)) {                                      \
1251         case EXACTFU: folder = foldEQ_latin1; break;      \
1252         case EXACTFL: folder = foldEQ_locale; break;      \
1253         case EXACTF:  folder = foldEQ; break;             \
1254         default:                                          \
1255             Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \
1256     }                                                     \
1257     while (s <= e) {                                      \
1258         if ( (CoNd)                                       \
1259              && (ln == 1 || folder(s, m, ln))             \
1260              && (!reginfo || regtry(reginfo, &s)) )       \
1261             goto got_it;                                  \
1262         s++;                                              \
1263     }                                                     \
1264 } STMT_END
1265
1266 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1267 STMT_START {                                          \
1268     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1269         CoDe                                          \
1270         s += uskip;                                   \
1271     }                                                 \
1272 } STMT_END
1273
1274 #define REXEC_FBC_SCAN(CoDe)                          \
1275 STMT_START {                                          \
1276     while (s < strend) {                              \
1277         CoDe                                          \
1278         s++;                                          \
1279     }                                                 \
1280 } STMT_END
1281
1282 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1283 REXEC_FBC_UTF8_SCAN(                                  \
1284     if (CoNd) {                                       \
1285         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1286             goto got_it;                              \
1287         else                                          \
1288             tmp = doevery;                            \
1289     }                                                 \
1290     else                                              \
1291         tmp = 1;                                      \
1292 )
1293
1294 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1295 REXEC_FBC_SCAN(                                       \
1296     if (CoNd) {                                       \
1297         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1298             goto got_it;                              \
1299         else                                          \
1300             tmp = doevery;                            \
1301     }                                                 \
1302     else                                              \
1303         tmp = 1;                                      \
1304 )
1305
1306 #define REXEC_FBC_TRYIT               \
1307 if ((!reginfo || regtry(reginfo, &s))) \
1308     goto got_it
1309
1310 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1311     if (utf8_target) {                                             \
1312         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1313     }                                                          \
1314     else {                                                     \
1315         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1316     }                                                          \
1317     break
1318
1319 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1320     if (utf8_target) {                                             \
1321         UtFpReLoAd;                                            \
1322         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1323     }                                                          \
1324     else {                                                     \
1325         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1326     }                                                          \
1327     break
1328
1329 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1330     PL_reg_flags |= RF_tainted;                                \
1331     if (utf8_target) {                                             \
1332         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1333     }                                                          \
1334     else {                                                     \
1335         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1336     }                                                          \
1337     break
1338
1339 #define DUMP_EXEC_POS(li,s,doutf8) \
1340     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1341
1342 /* We know what class REx starts with.  Try to find this position... */
1343 /* if reginfo is NULL, its a dryrun */
1344 /* annoyingly all the vars in this routine have different names from their counterparts
1345    in regmatch. /grrr */
1346
1347 STATIC char *
1348 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1349     const char *strend, regmatch_info *reginfo)
1350 {
1351         dVAR;
1352         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1353         char *m;
1354         STRLEN ln;
1355         STRLEN lnc;
1356         register STRLEN uskip;
1357         unsigned int c1;
1358         unsigned int c2;
1359         char *e;
1360         register I32 tmp = 1;   /* Scratch variable? */
1361         register const bool utf8_target = PL_reg_match_utf8;
1362         RXi_GET_DECL(prog,progi);
1363
1364         PERL_ARGS_ASSERT_FIND_BYCLASS;
1365
1366         /* We know what class it must start with. */
1367         switch (OP(c)) {
1368         case ANYOF:
1369             if (utf8_target) {
1370                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
1371                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1372                           reginclass(prog, c, (U8*)s, 0, utf8_target) :
1373                           REGINCLASS(prog, c, (U8*)s));
1374             }
1375             else {
1376                  while (s < strend) {
1377                       STRLEN skip = 1;
1378
1379                       if (REGINCLASS(prog, c, (U8*)s) ||
1380                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1381                            /* The assignment of 2 is intentional:
1382                             * for the folded sharp s, the skip is 2. */
1383                            (skip = SHARP_S_SKIP))) {
1384                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1385                                 goto got_it;
1386                            else
1387                                 tmp = doevery;
1388                       }
1389                       else
1390                            tmp = 1;
1391                       s += skip;
1392                  }
1393             }
1394             break;
1395         case CANY:
1396             REXEC_FBC_SCAN(
1397                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1398                     goto got_it;
1399                 else
1400                     tmp = doevery;
1401             );
1402             break;
1403         case EXACTFU:
1404         case EXACTF:
1405             m   = STRING(c);
1406             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1407             lnc = (I32) ln;     /* length to match in characters */
1408             if (UTF_PATTERN) {
1409                 STRLEN ulen1, ulen2;
1410                 U8 *sm = (U8 *) m;
1411                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1412                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1413                 /* used by commented-out code below */
1414                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1415
1416                 /* XXX: Since the node will be case folded at compile
1417                    time this logic is a little odd, although im not
1418                    sure that its actually wrong. --dmq */
1419
1420                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1421                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1422
1423                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1424                    codepoint of the first character in the converted
1425                    form, yet originally we did the extra step.
1426                    No tests fail by commenting this code out however
1427                    so Ive left it out. -- dmq.
1428
1429                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1430                                     0, uniflags);
1431                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1432                                     0, uniflags);
1433                 */
1434
1435                 lnc = 0;
1436                 while (sm < ((U8 *) m + ln)) {
1437                     lnc++;
1438                     sm += UTF8SKIP(sm);
1439                 }
1440             }
1441             else {
1442                 c1 = *(U8*)m;
1443                 if (utf8_target || OP(c) == EXACTFU) {
1444
1445                     /* Micro sign folds to GREEK SMALL LETTER MU;
1446                        LATIN_SMALL_LETTER_SHARP_S folds to 'ss', and this sets
1447                        c2 to the first 's' of the pair, and the code below will
1448                        look for others */
1449                     c2 = (c1 == MICRO_SIGN)
1450                         ? GREEK_SMALL_LETTER_MU
1451                         : (c1 == LATIN_SMALL_LETTER_SHARP_S)
1452                            ? 's'
1453                            : PL_fold_latin1[c1];
1454                 } else c2 = PL_fold[c1];
1455             }
1456             goto do_exactf;
1457         case EXACTFL:
1458             m   = STRING(c);
1459             ln  = STR_LEN(c);
1460             lnc = (I32) ln;
1461             c1 = *(U8*)m;
1462             c2 = PL_fold_locale[c1];
1463           do_exactf:
1464             e = HOP3c(strend, -((I32)lnc), s);
1465
1466             if (!reginfo && e < s)
1467                 e = s;                  /* Due to minlen logic of intuit() */
1468
1469             /* The idea in the EXACTF* cases is to first find the
1470              * first character of the EXACTF* node and then, if
1471              * necessary, case-insensitively compare the full
1472              * text of the node.  The c1 and c2 are the first
1473              * characters (though in Unicode it gets a bit
1474              * more complicated because there are more cases
1475              * than just upper and lower: one needs to use
1476              * the so-called folding case for case-insensitive
1477              * matching (called "loose matching" in Unicode).
1478              * foldEQ_utf8() will do just that. */
1479
1480             if (utf8_target || UTF_PATTERN) {
1481                 UV c, f;
1482                 U8 tmpbuf [UTF8_MAXBYTES+1];
1483                 STRLEN len = 1;
1484                 STRLEN foldlen;
1485                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1486                 if (c1 == c2) {
1487                     /* Upper and lower of 1st char are equal -
1488                      * probably not a "letter". */
1489                     while (s <= e) {
1490                         if (utf8_target) {
1491                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1492                                            uniflags);
1493                         } else {
1494                             c = *((U8*)s);
1495                         }
1496                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1497                     }
1498                 }
1499                 else {
1500                     while (s <= e) {
1501                         if (utf8_target) {
1502                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1503                                            uniflags);
1504                         } else {
1505                             c = *((U8*)s);
1506                         }
1507
1508                         /* Handle some of the three Greek sigmas cases.
1509                          * Note that not all the possible combinations
1510                          * are handled here: some of them are handled
1511                          * by the standard folding rules, and some of
1512                          * them (the character class or ANYOF cases)
1513                          * are handled during compiletime in
1514                          * regexec.c:S_regclass(). */
1515                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1516                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1517                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1518
1519                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1520                     }
1521                 }
1522             }
1523             else {
1524                 /* Neither pattern nor string are UTF8 */
1525                 if (c1 == c2)
1526                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1527                 else
1528                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1529             }
1530             break;
1531         case BOUNDL:
1532             PL_reg_flags |= RF_tainted;
1533             /* FALL THROUGH */
1534         case BOUND:
1535             if (utf8_target) {
1536                 if (s == PL_bostr)
1537                     tmp = '\n';
1538                 else {
1539                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1540                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1541                 }
1542                 tmp = ((OP(c) == BOUND ?
1543                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1544                 LOAD_UTF8_CHARCLASS_ALNUM();
1545                 REXEC_FBC_UTF8_SCAN(
1546                     if (tmp == !(OP(c) == BOUND ?
1547                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1548                                  isALNUM_LC_utf8((U8*)s)))
1549                     {
1550                         tmp = !tmp;
1551                         REXEC_FBC_TRYIT;
1552                 }
1553                 );
1554             }
1555             else {  /* Not utf8 */
1556                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1557                 tmp = cBOOL((OP(c) == BOUNDL)
1558                             ? isALNUM_LC(tmp)
1559                             : (isWORDCHAR_L1(tmp)
1560                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1561                 REXEC_FBC_SCAN(
1562                     if (tmp ==
1563                         !((OP(c) == BOUNDL)
1564                           ? isALNUM_LC(*s)
1565                           : (isWORDCHAR_L1((U8) *s)
1566                              && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1567                     {
1568                         tmp = !tmp;
1569                         REXEC_FBC_TRYIT;
1570                 }
1571                 );
1572             }
1573             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1574                 goto got_it;
1575             break;
1576         case NBOUNDL:
1577             PL_reg_flags |= RF_tainted;
1578             /* FALL THROUGH */
1579         case NBOUND:
1580             if (utf8_target) {
1581                 if (s == PL_bostr)
1582                     tmp = '\n';
1583                 else {
1584                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1585                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1586                 }
1587                 tmp = ((OP(c) == NBOUND ?
1588                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1589                 LOAD_UTF8_CHARCLASS_ALNUM();
1590                 REXEC_FBC_UTF8_SCAN(
1591                     if (tmp == !(OP(c) == NBOUND ?
1592                                  cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1593                                  isALNUM_LC_utf8((U8*)s)))
1594                         tmp = !tmp;
1595                     else REXEC_FBC_TRYIT;
1596                 );
1597             }
1598             else {
1599                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1600                 tmp = cBOOL((OP(c) == NBOUNDL)
1601                             ? isALNUM_LC(tmp)
1602                             : (isWORDCHAR_L1(tmp)
1603                                && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1604                 REXEC_FBC_SCAN(
1605                     if (tmp == ! cBOOL(
1606                             (OP(c) == NBOUNDL)
1607                             ? isALNUM_LC(*s)
1608                             : (isWORDCHAR_L1((U8) *s)
1609                                && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1610                     {
1611                         tmp = !tmp;
1612                     }
1613                     else REXEC_FBC_TRYIT;
1614                 );
1615             }
1616             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1617                 goto got_it;
1618             break;
1619         case ALNUM:
1620             REXEC_FBC_CSCAN_PRELOAD(
1621                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1622                 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1623                 (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
1624             );
1625         case ALNUML:
1626             REXEC_FBC_CSCAN_TAINT(
1627                 isALNUM_LC_utf8((U8*)s),
1628                 isALNUM_LC(*s)
1629             );
1630         case NALNUM:
1631             REXEC_FBC_CSCAN_PRELOAD(
1632                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1633                 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1634                 ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
1635             );
1636         case NALNUML:
1637             REXEC_FBC_CSCAN_TAINT(
1638                 !isALNUM_LC_utf8((U8*)s),
1639                 !isALNUM_LC(*s)
1640             );
1641         case SPACE:
1642             REXEC_FBC_CSCAN_PRELOAD(
1643                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1644                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1645                 isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
1646             );
1647         case SPACEL:
1648             REXEC_FBC_CSCAN_TAINT(
1649                 isSPACE_LC_utf8((U8*)s),
1650                 isSPACE_LC(*s)
1651             );
1652         case NSPACE:
1653             REXEC_FBC_CSCAN_PRELOAD(
1654                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1655                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1656                 !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
1657             );
1658         case NSPACEL:
1659             REXEC_FBC_CSCAN_TAINT(
1660                 !isSPACE_LC_utf8((U8*)s),
1661                 !isSPACE_LC(*s)
1662             );
1663         case DIGIT:
1664             REXEC_FBC_CSCAN_PRELOAD(
1665                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1666                 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1667                 isDIGIT(*s)
1668             );
1669         case DIGITL:
1670             REXEC_FBC_CSCAN_TAINT(
1671                 isDIGIT_LC_utf8((U8*)s),
1672                 isDIGIT_LC(*s)
1673             );
1674         case NDIGIT:
1675             REXEC_FBC_CSCAN_PRELOAD(
1676                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1677                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1678                 !isDIGIT(*s)
1679             );
1680         case NDIGITL:
1681             REXEC_FBC_CSCAN_TAINT(
1682                 !isDIGIT_LC_utf8((U8*)s),
1683                 !isDIGIT_LC(*s)
1684             );
1685         case LNBREAK:
1686             REXEC_FBC_CSCAN(
1687                 is_LNBREAK_utf8(s),
1688                 is_LNBREAK_latin1(s)
1689             );
1690         case VERTWS:
1691             REXEC_FBC_CSCAN(
1692                 is_VERTWS_utf8(s),
1693                 is_VERTWS_latin1(s)
1694             );
1695         case NVERTWS:
1696             REXEC_FBC_CSCAN(
1697                 !is_VERTWS_utf8(s),
1698                 !is_VERTWS_latin1(s)
1699             );
1700         case HORIZWS:
1701             REXEC_FBC_CSCAN(
1702                 is_HORIZWS_utf8(s),
1703                 is_HORIZWS_latin1(s)
1704             );
1705         case NHORIZWS:
1706             REXEC_FBC_CSCAN(
1707                 !is_HORIZWS_utf8(s),
1708                 !is_HORIZWS_latin1(s)
1709             );
1710         case AHOCORASICKC:
1711         case AHOCORASICK:
1712             {
1713                 DECL_TRIE_TYPE(c);
1714                 /* what trie are we using right now */
1715                 reg_ac_data *aho
1716                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1717                 reg_trie_data *trie
1718                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1719                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1720
1721                 const char *last_start = strend - trie->minlen;
1722 #ifdef DEBUGGING
1723                 const char *real_start = s;
1724 #endif
1725                 STRLEN maxlen = trie->maxlen;
1726                 SV *sv_points;
1727                 U8 **points; /* map of where we were in the input string
1728                                 when reading a given char. For ASCII this
1729                                 is unnecessary overhead as the relationship
1730                                 is always 1:1, but for Unicode, especially
1731                                 case folded Unicode this is not true. */
1732                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1733                 U8 *bitmap=NULL;
1734
1735
1736                 GET_RE_DEBUG_FLAGS_DECL;
1737
1738                 /* We can't just allocate points here. We need to wrap it in
1739                  * an SV so it gets freed properly if there is a croak while
1740                  * running the match */
1741                 ENTER;
1742                 SAVETMPS;
1743                 sv_points=newSV(maxlen * sizeof(U8 *));
1744                 SvCUR_set(sv_points,
1745                     maxlen * sizeof(U8 *));
1746                 SvPOK_on(sv_points);
1747                 sv_2mortal(sv_points);
1748                 points=(U8**)SvPV_nolen(sv_points );
1749                 if ( trie_type != trie_utf8_fold
1750                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1751                 {
1752                     if (trie->bitmap)
1753                         bitmap=(U8*)trie->bitmap;
1754                     else
1755                         bitmap=(U8*)ANYOF_BITMAP(c);
1756                 }
1757                 /* this is the Aho-Corasick algorithm modified a touch
1758                    to include special handling for long "unknown char"
1759                    sequences. The basic idea being that we use AC as long
1760                    as we are dealing with a possible matching char, when
1761                    we encounter an unknown char (and we have not encountered
1762                    an accepting state) we scan forward until we find a legal
1763                    starting char.
1764                    AC matching is basically that of trie matching, except
1765                    that when we encounter a failing transition, we fall back
1766                    to the current states "fail state", and try the current char
1767                    again, a process we repeat until we reach the root state,
1768                    state 1, or a legal transition. If we fail on the root state
1769                    then we can either terminate if we have reached an accepting
1770                    state previously, or restart the entire process from the beginning
1771                    if we have not.
1772
1773                  */
1774                 while (s <= last_start) {
1775                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1776                     U8 *uc = (U8*)s;
1777                     U16 charid = 0;
1778                     U32 base = 1;
1779                     U32 state = 1;
1780                     UV uvc = 0;
1781                     STRLEN len = 0;
1782                     STRLEN foldlen = 0;
1783                     U8 *uscan = (U8*)NULL;
1784                     U8 *leftmost = NULL;
1785 #ifdef DEBUGGING
1786                     U32 accepted_word= 0;
1787 #endif
1788                     U32 pointpos = 0;
1789
1790                     while ( state && uc <= (U8*)strend ) {
1791                         int failed=0;
1792                         U32 word = aho->states[ state ].wordnum;
1793
1794                         if( state==1 ) {
1795                             if ( bitmap ) {
1796                                 DEBUG_TRIE_EXECUTE_r(
1797                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1798                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1799                                             (char *)uc, utf8_target );
1800                                         PerlIO_printf( Perl_debug_log,
1801                                             " Scanning for legal start char...\n");
1802                                     }
1803                                 );
1804                                 if (utf8_target) {
1805                                     while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1806                                         uc += UTF8SKIP(uc);
1807                                     }
1808                                 } else {
1809                                     while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1810                                         uc++;
1811                                     }
1812                                 }
1813                                 s= (char *)uc;
1814                             }
1815                             if (uc >(U8*)last_start) break;
1816                         }
1817
1818                         if ( word ) {
1819                             U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1820                             if (!leftmost || lpos < leftmost) {
1821                                 DEBUG_r(accepted_word=word);
1822                                 leftmost= lpos;
1823                             }
1824                             if (base==0) break;
1825
1826                         }
1827                         points[pointpos++ % maxlen]= uc;
1828                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1829                                              uscan, len, uvc, charid, foldlen,
1830                                              foldbuf, uniflags);
1831                         DEBUG_TRIE_EXECUTE_r({
1832                             dump_exec_pos( (char *)uc, c, strend, real_start,
1833                                 s,   utf8_target );
1834                             PerlIO_printf(Perl_debug_log,
1835                                 " Charid:%3u CP:%4"UVxf" ",
1836                                  charid, uvc);
1837                         });
1838
1839                         do {
1840 #ifdef DEBUGGING
1841                             word = aho->states[ state ].wordnum;
1842 #endif
1843                             base = aho->states[ state ].trans.base;
1844
1845                             DEBUG_TRIE_EXECUTE_r({
1846                                 if (failed)
1847                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1848                                         s,   utf8_target );
1849                                 PerlIO_printf( Perl_debug_log,
1850                                     "%sState: %4"UVxf", word=%"UVxf,
1851                                     failed ? " Fail transition to " : "",
1852                                     (UV)state, (UV)word);
1853                             });
1854                             if ( base ) {
1855                                 U32 tmp;
1856                                 I32 offset;
1857                                 if (charid &&
1858                                      ( ((offset = base + charid
1859                                         - 1 - trie->uniquecharcount)) >= 0)
1860                                      && ((U32)offset < trie->lasttrans)
1861                                      && trie->trans[offset].check == state
1862                                      && (tmp=trie->trans[offset].next))
1863                                 {
1864                                     DEBUG_TRIE_EXECUTE_r(
1865                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1866                                     state = tmp;
1867                                     break;
1868                                 }
1869                                 else {
1870                                     DEBUG_TRIE_EXECUTE_r(
1871                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1872                                     failed = 1;
1873                                     state = aho->fail[state];
1874                                 }
1875                             }
1876                             else {
1877                                 /* we must be accepting here */
1878                                 DEBUG_TRIE_EXECUTE_r(
1879                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1880                                 failed = 1;
1881                                 break;
1882                             }
1883                         } while(state);
1884                         uc += len;
1885                         if (failed) {
1886                             if (leftmost)
1887                                 break;
1888                             if (!state) state = 1;
1889                         }
1890                     }
1891                     if ( aho->states[ state ].wordnum ) {
1892                         U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1893                         if (!leftmost || lpos < leftmost) {
1894                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1895                             leftmost = lpos;
1896                         }
1897                     }
1898                     if (leftmost) {
1899                         s = (char*)leftmost;
1900                         DEBUG_TRIE_EXECUTE_r({
1901                             PerlIO_printf(
1902                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1903                                 (UV)accepted_word, (IV)(s - real_start)
1904                             );
1905                         });
1906                         if (!reginfo || regtry(reginfo, &s)) {
1907                             FREETMPS;
1908                             LEAVE;
1909                             goto got_it;
1910                         }
1911                         s = HOPc(s,1);
1912                         DEBUG_TRIE_EXECUTE_r({
1913                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1914                         });
1915                     } else {
1916                         DEBUG_TRIE_EXECUTE_r(
1917                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1918                         break;
1919                     }
1920                 }
1921                 FREETMPS;
1922                 LEAVE;
1923             }
1924             break;
1925         default:
1926             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1927             break;
1928         }
1929         return 0;
1930       got_it:
1931         return s;
1932 }
1933
1934
1935 /*
1936  - regexec_flags - match a regexp against a string
1937  */
1938 I32
1939 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1940               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1941 /* strend: pointer to null at end of string */
1942 /* strbeg: real beginning of string */
1943 /* minend: end of match must be >=minend after stringarg. */
1944 /* data: May be used for some additional optimizations.
1945          Currently its only used, with a U32 cast, for transmitting
1946          the ganch offset when doing a /g match. This will change */
1947 /* nosave: For optimizations. */
1948 {
1949     dVAR;
1950     struct regexp *const prog = (struct regexp *)SvANY(rx);
1951     /*register*/ char *s;
1952     register regnode *c;
1953     /*register*/ char *startpos = stringarg;
1954     I32 minlen;         /* must match at least this many chars */
1955     I32 dontbother = 0; /* how many characters not to try at end */
1956     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1957     I32 scream_pos = -1;                /* Internal iterator of scream. */
1958     char *scream_olds = NULL;
1959     const bool utf8_target = cBOOL(DO_UTF8(sv));
1960     I32 multiline;
1961     RXi_GET_DECL(prog,progi);
1962     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1963     regexp_paren_pair *swap = NULL;
1964     GET_RE_DEBUG_FLAGS_DECL;
1965
1966     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1967     PERL_UNUSED_ARG(data);
1968
1969     /* Be paranoid... */
1970     if (prog == NULL || startpos == NULL) {
1971         Perl_croak(aTHX_ "NULL regexp parameter");
1972         return 0;
1973     }
1974
1975     multiline = prog->extflags & RXf_PMf_MULTILINE;
1976     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1977
1978     RX_MATCH_UTF8_set(rx, utf8_target);
1979     DEBUG_EXECUTE_r(
1980         debug_start_match(rx, utf8_target, startpos, strend,
1981         "Matching");
1982     );
1983
1984     minlen = prog->minlen;
1985
1986     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1987         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1988                               "String too short [regexec_flags]...\n"));
1989         goto phooey;
1990     }
1991
1992
1993     /* Check validity of program. */
1994     if (UCHARAT(progi->program) != REG_MAGIC) {
1995         Perl_croak(aTHX_ "corrupted regexp program");
1996     }
1997
1998     PL_reg_flags = 0;
1999     PL_reg_eval_set = 0;
2000     PL_reg_maxiter = 0;
2001
2002     if (RX_UTF8(rx))
2003         PL_reg_flags |= RF_utf8;
2004
2005     /* Mark beginning of line for ^ and lookbehind. */
2006     reginfo.bol = startpos; /* XXX not used ??? */
2007     PL_bostr  = strbeg;
2008     reginfo.sv = sv;
2009
2010     /* Mark end of line for $ (and such) */
2011     PL_regeol = strend;
2012
2013     /* see how far we have to get to not match where we matched before */
2014     reginfo.till = startpos+minend;
2015
2016     /* If there is a "must appear" string, look for it. */
2017     s = startpos;
2018
2019     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2020         MAGIC *mg;
2021         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
2022             reginfo.ganch = startpos + prog->gofs;
2023             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2024               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2025         } else if (sv && SvTYPE(sv) >= SVt_PVMG
2026                   && SvMAGIC(sv)
2027                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2028                   && mg->mg_len >= 0) {
2029             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
2030             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2031                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2032
2033             if (prog->extflags & RXf_ANCH_GPOS) {
2034                 if (s > reginfo.ganch)
2035                     goto phooey;
2036                 s = reginfo.ganch - prog->gofs;
2037                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2038                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2039                 if (s < strbeg)
2040                     goto phooey;
2041             }
2042         }
2043         else if (data) {
2044             reginfo.ganch = strbeg + PTR2UV(data);
2045             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2046                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2047
2048         } else {                                /* pos() not defined */
2049             reginfo.ganch = strbeg;
2050             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2051                  "GPOS: reginfo.ganch = strbeg\n"));
2052         }
2053     }
2054     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2055         /* We have to be careful. If the previous successful match
2056            was from this regex we don't want a subsequent partially
2057            successful match to clobber the old results.
2058            So when we detect this possibility we add a swap buffer
2059            to the re, and switch the buffer each match. If we fail
2060            we switch it back, otherwise we leave it swapped.
2061         */
2062         swap = prog->offs;
2063         /* do we need a save destructor here for eval dies? */
2064         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2065     }
2066     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2067         re_scream_pos_data d;
2068
2069         d.scream_olds = &scream_olds;
2070         d.scream_pos = &scream_pos;
2071         s = re_intuit_start(rx, sv, s, strend, flags, &d);
2072         if (!s) {
2073             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2074             goto phooey;        /* not present */
2075         }
2076     }
2077
2078
2079
2080     /* Simplest case:  anchored match need be tried only once. */
2081     /*  [unless only anchor is BOL and multiline is set] */
2082     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2083         if (s == startpos && regtry(&reginfo, &startpos))
2084             goto got_it;
2085         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2086                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2087         {
2088             char *end;
2089
2090             if (minlen)
2091                 dontbother = minlen - 1;
2092             end = HOP3c(strend, -dontbother, strbeg) - 1;
2093             /* for multiline we only have to try after newlines */
2094             if (prog->check_substr || prog->check_utf8) {
2095                 /* because of the goto we can not easily reuse the macros for bifurcating the
2096                    unicode/non-unicode match modes here like we do elsewhere - demerphq */
2097                 if (utf8_target) {
2098                     if (s == startpos)
2099                         goto after_try_utf8;
2100                     while (1) {
2101                         if (regtry(&reginfo, &s)) {
2102                             goto got_it;
2103                         }
2104                       after_try_utf8:
2105                         if (s > end) {
2106                             goto phooey;
2107                         }
2108                         if (prog->extflags & RXf_USE_INTUIT) {
2109                             s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2110                             if (!s) {
2111                                 goto phooey;
2112                             }
2113                         }
2114                         else {
2115                             s += UTF8SKIP(s);
2116                         }
2117                     }
2118                 } /* end search for check string in unicode */
2119                 else {
2120                     if (s == startpos) {
2121                         goto after_try_latin;
2122                     }
2123                     while (1) {
2124                         if (regtry(&reginfo, &s)) {
2125                             goto got_it;
2126                         }
2127                       after_try_latin:
2128                         if (s > end) {
2129                             goto phooey;
2130                         }
2131                         if (prog->extflags & RXf_USE_INTUIT) {
2132                             s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2133                             if (!s) {
2134                                 goto phooey;
2135                             }
2136                         }
2137                         else {
2138                             s++;
2139                         }
2140                     }
2141                 } /* end search for check string in latin*/
2142             } /* end search for check string */
2143             else { /* search for newline */
2144                 if (s > startpos) {
2145                     /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2146                     s--;
2147                 }
2148                 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2149                 while (s < end) {
2150                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2151                         if (regtry(&reginfo, &s))
2152                             goto got_it;
2153                     }
2154                 }
2155             } /* end search for newline */
2156         } /* end anchored/multiline check string search */
2157         goto phooey;
2158     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2159     {
2160         /* the warning about reginfo.ganch being used without intialization
2161            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2162            and we only enter this block when the same bit is set. */
2163         char *tmp_s = reginfo.ganch - prog->gofs;
2164
2165         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2166             goto got_it;
2167         goto phooey;
2168     }
2169
2170     /* Messy cases:  unanchored match. */
2171     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2172         /* we have /x+whatever/ */
2173         /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2174         char ch;
2175 #ifdef DEBUGGING
2176         int did_match = 0;
2177 #endif
2178         if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2179             utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2180         ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2181
2182         if (utf8_target) {
2183             REXEC_FBC_SCAN(
2184                 if (*s == ch) {
2185                     DEBUG_EXECUTE_r( did_match = 1 );
2186                     if (regtry(&reginfo, &s)) goto got_it;
2187                     s += UTF8SKIP(s);
2188                     while (s < strend && *s == ch)
2189                         s += UTF8SKIP(s);
2190                 }
2191             );
2192         }
2193         else {
2194             REXEC_FBC_SCAN(
2195                 if (*s == ch) {
2196                     DEBUG_EXECUTE_r( did_match = 1 );
2197                     if (regtry(&reginfo, &s)) goto got_it;
2198                     s++;
2199                     while (s < strend && *s == ch)
2200                         s++;
2201                 }
2202             );
2203         }
2204         DEBUG_EXECUTE_r(if (!did_match)
2205                 PerlIO_printf(Perl_debug_log,
2206                                   "Did not find anchored character...\n")
2207                );
2208     }
2209     else if (prog->anchored_substr != NULL
2210               || prog->anchored_utf8 != NULL
2211               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2212                   && prog->float_max_offset < strend - s)) {
2213         SV *must;
2214         I32 back_max;
2215         I32 back_min;
2216         char *last;
2217         char *last1;            /* Last position checked before */
2218 #ifdef DEBUGGING
2219         int did_match = 0;
2220 #endif
2221         if (prog->anchored_substr || prog->anchored_utf8) {
2222             if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2223                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2224             must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2225             back_max = back_min = prog->anchored_offset;
2226         } else {
2227             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2228                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2229             must = utf8_target ? prog->float_utf8 : prog->float_substr;
2230             back_max = prog->float_max_offset;
2231             back_min = prog->float_min_offset;
2232         }
2233
2234
2235         if (must == &PL_sv_undef)
2236             /* could not downgrade utf8 check substring, so must fail */
2237             goto phooey;
2238
2239         if (back_min<0) {
2240             last = strend;
2241         } else {
2242             last = HOP3c(strend,        /* Cannot start after this */
2243                   -(I32)(CHR_SVLEN(must)
2244                          - (SvTAIL(must) != 0) + back_min), strbeg);
2245         }
2246         if (s > PL_bostr)
2247             last1 = HOPc(s, -1);
2248         else
2249             last1 = s - 1;      /* bogus */
2250
2251         /* XXXX check_substr already used to find "s", can optimize if
2252            check_substr==must. */
2253         scream_pos = -1;
2254         dontbother = end_shift;
2255         strend = HOPc(strend, -dontbother);
2256         while ( (s <= last) &&
2257                 ((flags & REXEC_SCREAM)
2258                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2259                                     end_shift, &scream_pos, 0))
2260                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2261                                   (unsigned char*)strend, must,
2262                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2263             /* we may be pointing at the wrong string */
2264             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2265                 s = strbeg + (s - SvPVX_const(sv));
2266             DEBUG_EXECUTE_r( did_match = 1 );
2267             if (HOPc(s, -back_max) > last1) {
2268                 last1 = HOPc(s, -back_min);
2269                 s = HOPc(s, -back_max);
2270             }
2271             else {
2272                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2273
2274                 last1 = HOPc(s, -back_min);
2275                 s = t;
2276             }
2277             if (utf8_target) {
2278                 while (s <= last1) {
2279                     if (regtry(&reginfo, &s))
2280                         goto got_it;
2281                     s += UTF8SKIP(s);
2282                 }
2283             }
2284             else {
2285                 while (s <= last1) {
2286                     if (regtry(&reginfo, &s))
2287                         goto got_it;
2288                     s++;
2289                 }
2290             }
2291         }
2292         DEBUG_EXECUTE_r(if (!did_match) {
2293             RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2294                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2295             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2296                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2297                                ? "anchored" : "floating"),
2298                 quoted, RE_SV_TAIL(must));
2299         });
2300         goto phooey;
2301     }
2302     else if ( (c = progi->regstclass) ) {
2303         if (minlen) {
2304             const OPCODE op = OP(progi->regstclass);
2305             /* don't bother with what can't match */
2306             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2307                 strend = HOPc(strend, -(minlen - 1));
2308         }
2309         DEBUG_EXECUTE_r({
2310             SV * const prop = sv_newmortal();
2311             regprop(prog, prop, c);
2312             {
2313                 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2314                     s,strend-s,60);
2315                 PerlIO_printf(Perl_debug_log,
2316                     "Matching stclass %.*s against %s (%d bytes)\n",
2317                     (int)SvCUR(prop), SvPVX_const(prop),
2318                      quoted, (int)(strend - s));
2319             }
2320         });
2321         if (find_byclass(prog, c, s, strend, &reginfo))
2322             goto got_it;
2323         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2324     }
2325     else {
2326         dontbother = 0;
2327         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2328             /* Trim the end. */
2329             char *last;
2330             SV* float_real;
2331
2332             if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2333                 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2334             float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2335
2336             if (flags & REXEC_SCREAM) {
2337                 last = screaminstr(sv, float_real, s - strbeg,
2338                                    end_shift, &scream_pos, 1); /* last one */
2339                 if (!last)
2340                     last = scream_olds; /* Only one occurrence. */
2341                 /* we may be pointing at the wrong string */
2342                 else if (RXp_MATCH_COPIED(prog))
2343                     s = strbeg + (s - SvPVX_const(sv));
2344             }
2345             else {
2346                 STRLEN len;
2347                 const char * const little = SvPV_const(float_real, len);
2348
2349                 if (SvTAIL(float_real)) {
2350                     if (memEQ(strend - len + 1, little, len - 1))
2351                         last = strend - len + 1;
2352                     else if (!multiline)
2353                         last = memEQ(strend - len, little, len)
2354                             ? strend - len : NULL;
2355                     else
2356                         goto find_last;
2357                 } else {
2358                   find_last:
2359                     if (len)
2360                         last = rninstr(s, strend, little, little + len);
2361                     else
2362                         last = strend;  /* matching "$" */
2363                 }
2364             }
2365             if (last == NULL) {
2366                 DEBUG_EXECUTE_r(
2367                     PerlIO_printf(Perl_debug_log,
2368                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2369                         PL_colors[4], PL_colors[5]));
2370                 goto phooey; /* Should not happen! */
2371             }
2372             dontbother = strend - last + prog->float_min_offset;
2373         }
2374         if (minlen && (dontbother < minlen))
2375             dontbother = minlen - 1;
2376         strend -= dontbother;              /* this one's always in bytes! */
2377         /* We don't know much -- general case. */
2378         if (utf8_target) {
2379             for (;;) {
2380                 if (regtry(&reginfo, &s))
2381                     goto got_it;
2382                 if (s >= strend)
2383                     break;
2384                 s += UTF8SKIP(s);
2385             };
2386         }
2387         else {
2388             do {
2389                 if (regtry(&reginfo, &s))
2390                     goto got_it;
2391             } while (s++ < strend);
2392         }
2393     }
2394
2395     /* Failure. */
2396     goto phooey;
2397
2398 got_it:
2399     Safefree(swap);
2400     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2401
2402     if (PL_reg_eval_set)
2403         restore_pos(aTHX_ prog);
2404     if (RXp_PAREN_NAMES(prog))
2405         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2406
2407     /* make sure $`, $&, $', and $digit will work later */
2408     if ( !(flags & REXEC_NOT_FIRST) ) {
2409         RX_MATCH_COPY_FREE(rx);
2410         if (flags & REXEC_COPY_STR) {
2411             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2412 #ifdef PERL_OLD_COPY_ON_WRITE
2413             if ((SvIsCOW(sv)
2414                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2415                 if (DEBUG_C_TEST) {
2416                     PerlIO_printf(Perl_debug_log,
2417                                   "Copy on write: regexp capture, type %d\n",
2418                                   (int) SvTYPE(sv));
2419                 }
2420                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2421                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2422                 assert (SvPOKp(prog->saved_copy));
2423             } else
2424 #endif
2425             {
2426                 RX_MATCH_COPIED_on(rx);
2427                 s = savepvn(strbeg, i);
2428                 prog->subbeg = s;
2429             }
2430             prog->sublen = i;
2431         }
2432         else {
2433             prog->subbeg = strbeg;
2434             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2435         }
2436     }
2437
2438     return 1;
2439
2440 phooey:
2441     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2442                           PL_colors[4], PL_colors[5]));
2443     if (PL_reg_eval_set)
2444         restore_pos(aTHX_ prog);
2445     if (swap) {
2446         /* we failed :-( roll it back */
2447         Safefree(prog->offs);
2448         prog->offs = swap;
2449     }
2450
2451     return 0;
2452 }
2453
2454
2455 /*
2456  - regtry - try match at specific point
2457  */
2458 STATIC I32                      /* 0 failure, 1 success */
2459 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2460 {
2461     dVAR;
2462     CHECKPOINT lastcp;
2463     REGEXP *const rx = reginfo->prog;
2464     regexp *const prog = (struct regexp *)SvANY(rx);
2465     RXi_GET_DECL(prog,progi);
2466     GET_RE_DEBUG_FLAGS_DECL;
2467
2468     PERL_ARGS_ASSERT_REGTRY;
2469
2470     reginfo->cutpoint=NULL;
2471
2472     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2473         MAGIC *mg;
2474
2475         PL_reg_eval_set = RS_init;
2476         DEBUG_EXECUTE_r(DEBUG_s(
2477             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2478                           (IV)(PL_stack_sp - PL_stack_base));
2479             ));
2480         SAVESTACK_CXPOS();
2481         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2482         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2483         SAVETMPS;
2484         /* Apparently this is not needed, judging by wantarray. */
2485         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2486            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2487
2488         if (reginfo->sv) {
2489             /* Make $_ available to executed code. */
2490             if (reginfo->sv != DEFSV) {
2491                 SAVE_DEFSV;
2492                 DEFSV_set(reginfo->sv);
2493             }
2494
2495             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2496                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2497                 /* prepare for quick setting of pos */
2498 #ifdef PERL_OLD_COPY_ON_WRITE
2499                 if (SvIsCOW(reginfo->sv))
2500                     sv_force_normal_flags(reginfo->sv, 0);
2501 #endif
2502                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2503                                  &PL_vtbl_mglob, NULL, 0);
2504                 mg->mg_len = -1;
2505             }
2506             PL_reg_magic    = mg;
2507             PL_reg_oldpos   = mg->mg_len;
2508             SAVEDESTRUCTOR_X(restore_pos, prog);
2509         }
2510         if (!PL_reg_curpm) {
2511             Newxz(PL_reg_curpm, 1, PMOP);
2512 #ifdef USE_ITHREADS
2513             {
2514                 SV* const repointer = &PL_sv_undef;
2515                 /* this regexp is also owned by the new PL_reg_curpm, which
2516                    will try to free it.  */
2517                 av_push(PL_regex_padav, repointer);
2518                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2519                 PL_regex_pad = AvARRAY(PL_regex_padav);
2520             }
2521 #endif
2522         }
2523 #ifdef USE_ITHREADS
2524         /* It seems that non-ithreads works both with and without this code.
2525            So for efficiency reasons it seems best not to have the code
2526            compiled when it is not needed.  */
2527         /* This is safe against NULLs: */
2528         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2529         /* PM_reg_curpm owns a reference to this regexp.  */
2530         ReREFCNT_inc(rx);
2531 #endif
2532         PM_SETRE(PL_reg_curpm, rx);
2533         PL_reg_oldcurpm = PL_curpm;
2534         PL_curpm = PL_reg_curpm;
2535         if (RXp_MATCH_COPIED(prog)) {
2536             /*  Here is a serious problem: we cannot rewrite subbeg,
2537                 since it may be needed if this match fails.  Thus
2538                 $` inside (?{}) could fail... */
2539             PL_reg_oldsaved = prog->subbeg;
2540             PL_reg_oldsavedlen = prog->sublen;
2541 #ifdef PERL_OLD_COPY_ON_WRITE
2542             PL_nrs = prog->saved_copy;
2543 #endif
2544             RXp_MATCH_COPIED_off(prog);
2545         }
2546         else
2547             PL_reg_oldsaved = NULL;
2548         prog->subbeg = PL_bostr;
2549         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2550     }
2551     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2552     prog->offs[0].start = *startpos - PL_bostr;
2553     PL_reginput = *startpos;
2554     PL_reglastparen = &prog->lastparen;
2555     PL_reglastcloseparen = &prog->lastcloseparen;
2556     prog->lastparen = 0;
2557     prog->lastcloseparen = 0;
2558     PL_regsize = 0;
2559     PL_regoffs = prog->offs;
2560     if (PL_reg_start_tmpl <= prog->nparens) {
2561         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2562         if(PL_reg_start_tmp)
2563             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2564         else
2565             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2566     }
2567
2568     /* XXXX What this code is doing here?!!!  There should be no need
2569        to do this again and again, PL_reglastparen should take care of
2570        this!  --ilya*/
2571
2572     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2573      * Actually, the code in regcppop() (which Ilya may be meaning by
2574      * PL_reglastparen), is not needed at all by the test suite
2575      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2576      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2577      * Meanwhile, this code *is* needed for the
2578      * above-mentioned test suite tests to succeed.  The common theme
2579      * on those tests seems to be returning null fields from matches.
2580      * --jhi updated by dapm */
2581 #if 1
2582     if (prog->nparens) {
2583         regexp_paren_pair *pp = PL_regoffs;
2584         register I32 i;
2585         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2586             ++pp;
2587             pp->start = -1;
2588             pp->end = -1;
2589         }
2590     }
2591 #endif
2592     REGCP_SET(lastcp);
2593     if (regmatch(reginfo, progi->program + 1)) {
2594         PL_regoffs[0].end = PL_reginput - PL_bostr;
2595         return 1;
2596     }
2597     if (reginfo->cutpoint)
2598         *startpos= reginfo->cutpoint;
2599     REGCP_UNWIND(lastcp);
2600     return 0;
2601 }
2602
2603
2604 #define sayYES goto yes
2605 #define sayNO goto no
2606 #define sayNO_SILENT goto no_silent
2607
2608 /* we dont use STMT_START/END here because it leads to
2609    "unreachable code" warnings, which are bogus, but distracting. */
2610 #define CACHEsayNO \
2611     if (ST.cache_mask) \
2612        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2613     sayNO
2614
2615 /* this is used to determine how far from the left messages like
2616    'failed...' are printed. It should be set such that messages
2617    are inline with the regop output that created them.
2618 */
2619 #define REPORT_CODE_OFF 32
2620
2621
2622 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2623 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2624
2625 #define SLAB_FIRST(s) (&(s)->states[0])
2626 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2627
2628 /* grab a new slab and return the first slot in it */
2629
2630 STATIC regmatch_state *
2631 S_push_slab(pTHX)
2632 {
2633 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2634     dMY_CXT;
2635 #endif
2636     regmatch_slab *s = PL_regmatch_slab->next;
2637     if (!s) {
2638         Newx(s, 1, regmatch_slab);
2639         s->prev = PL_regmatch_slab;
2640         s->next = NULL;
2641         PL_regmatch_slab->next = s;
2642     }
2643     PL_regmatch_slab = s;
2644     return SLAB_FIRST(s);
2645 }
2646
2647
2648 /* push a new state then goto it */
2649
2650 #define PUSH_STATE_GOTO(state, node) \
2651     scan = node; \
2652     st->resume_state = state; \
2653     goto push_state;
2654
2655 /* push a new state with success backtracking, then goto it */
2656
2657 #define PUSH_YES_STATE_GOTO(state, node) \
2658     scan = node; \
2659     st->resume_state = state; \
2660     goto push_yes_state;
2661
2662
2663
2664 /*
2665
2666 regmatch() - main matching routine
2667
2668 This is basically one big switch statement in a loop. We execute an op,
2669 set 'next' to point the next op, and continue. If we come to a point which
2670 we may need to backtrack to on failure such as (A|B|C), we push a
2671 backtrack state onto the backtrack stack. On failure, we pop the top
2672 state, and re-enter the loop at the state indicated. If there are no more
2673 states to pop, we return failure.
2674
2675 Sometimes we also need to backtrack on success; for example /A+/, where
2676 after successfully matching one A, we need to go back and try to
2677 match another one; similarly for lookahead assertions: if the assertion
2678 completes successfully, we backtrack to the state just before the assertion
2679 and then carry on.  In these cases, the pushed state is marked as
2680 'backtrack on success too'. This marking is in fact done by a chain of
2681 pointers, each pointing to the previous 'yes' state. On success, we pop to
2682 the nearest yes state, discarding any intermediate failure-only states.
2683 Sometimes a yes state is pushed just to force some cleanup code to be
2684 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2685 it to free the inner regex.
2686
2687 Note that failure backtracking rewinds the cursor position, while
2688 success backtracking leaves it alone.
2689
2690 A pattern is complete when the END op is executed, while a subpattern
2691 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2692 ops trigger the "pop to last yes state if any, otherwise return true"
2693 behaviour.
2694
2695 A common convention in this function is to use A and B to refer to the two
2696 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2697 the subpattern to be matched possibly multiple times, while B is the entire
2698 rest of the pattern. Variable and state names reflect this convention.
2699
2700 The states in the main switch are the union of ops and failure/success of
2701 substates associated with with that op.  For example, IFMATCH is the op
2702 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2703 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2704 successfully matched A and IFMATCH_A_fail is a state saying that we have
2705 just failed to match A. Resume states always come in pairs. The backtrack
2706 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2707 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2708 on success or failure.
2709
2710 The struct that holds a backtracking state is actually a big union, with
2711 one variant for each major type of op. The variable st points to the
2712 top-most backtrack struct. To make the code clearer, within each
2713 block of code we #define ST to alias the relevant union.
2714
2715 Here's a concrete example of a (vastly oversimplified) IFMATCH
2716 implementation:
2717
2718     switch (state) {
2719     ....
2720
2721 #define ST st->u.ifmatch
2722
2723     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2724         ST.foo = ...; // some state we wish to save
2725         ...
2726         // push a yes backtrack state with a resume value of
2727         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2728         // first node of A:
2729         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2730         // NOTREACHED
2731
2732     case IFMATCH_A: // we have successfully executed A; now continue with B
2733         next = B;
2734         bar = ST.foo; // do something with the preserved value
2735         break;
2736
2737     case IFMATCH_A_fail: // A failed, so the assertion failed
2738         ...;   // do some housekeeping, then ...
2739         sayNO; // propagate the failure
2740
2741 #undef ST
2742
2743     ...
2744     }
2745
2746 For any old-timers reading this who are familiar with the old recursive
2747 approach, the code above is equivalent to:
2748
2749     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2750     {
2751         int foo = ...
2752         ...
2753         if (regmatch(A)) {
2754             next = B;
2755             bar = foo;
2756             break;
2757         }
2758         ...;   // do some housekeeping, then ...
2759         sayNO; // propagate the failure
2760     }
2761
2762 The topmost backtrack state, pointed to by st, is usually free. If you
2763 want to claim it, populate any ST.foo fields in it with values you wish to
2764 save, then do one of
2765
2766         PUSH_STATE_GOTO(resume_state, node);
2767         PUSH_YES_STATE_GOTO(resume_state, node);
2768
2769 which sets that backtrack state's resume value to 'resume_state', pushes a
2770 new free entry to the top of the backtrack stack, then goes to 'node'.
2771 On backtracking, the free slot is popped, and the saved state becomes the
2772 new free state. An ST.foo field in this new top state can be temporarily
2773 accessed to retrieve values, but once the main loop is re-entered, it
2774 becomes available for reuse.
2775
2776 Note that the depth of the backtrack stack constantly increases during the
2777 left-to-right execution of the pattern, rather than going up and down with
2778 the pattern nesting. For example the stack is at its maximum at Z at the
2779 end of the pattern, rather than at X in the following:
2780
2781     /(((X)+)+)+....(Y)+....Z/
2782
2783 The only exceptions to this are lookahead/behind assertions and the cut,
2784 (?>A), which pop all the backtrack states associated with A before
2785 continuing.
2786
2787 Bascktrack state structs are allocated in slabs of about 4K in size.
2788 PL_regmatch_state and st always point to the currently active state,
2789 and PL_regmatch_slab points to the slab currently containing
2790 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2791 allocated, and is never freed until interpreter destruction. When the slab
2792 is full, a new one is allocated and chained to the end. At exit from
2793 regmatch(), slabs allocated since entry are freed.
2794
2795 */
2796
2797
2798 #define DEBUG_STATE_pp(pp)                                  \
2799     DEBUG_STATE_r({                                         \
2800         DUMP_EXEC_POS(locinput, scan, utf8_target);                 \
2801         PerlIO_printf(Perl_debug_log,                       \
2802             "    %*s"pp" %s%s%s%s%s\n",                     \
2803             depth*2, "",                                    \
2804             PL_reg_name[st->resume_state],                     \
2805             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2806             ((st==yes_state) ? "Y" : ""),                   \
2807             ((st==mark_state) ? "M" : ""),                  \
2808             ((st==yes_state||st==mark_state) ? "]" : "")    \
2809         );                                                  \
2810     });
2811
2812
2813 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2814
2815 #ifdef DEBUGGING
2816
2817 STATIC void
2818 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2819     const char *start, const char *end, const char *blurb)
2820 {
2821     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2822
2823     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2824
2825     if (!PL_colorset)
2826             reginitcolors();
2827     {
2828         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2829             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2830
2831         RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2832             start, end - start, 60);
2833
2834         PerlIO_printf(Perl_debug_log,
2835             "%s%s REx%s %s against %s\n",
2836                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2837
2838         if (utf8_target||utf8_pat)
2839             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2840                 utf8_pat ? "pattern" : "",
2841                 utf8_pat && utf8_target ? " and " : "",
2842                 utf8_target ? "string" : ""
2843             );
2844     }
2845 }
2846
2847 STATIC void
2848 S_dump_exec_pos(pTHX_ const char *locinput,
2849                       const regnode *scan,
2850                       const char *loc_regeol,
2851                       const char *loc_bostr,
2852                       const char *loc_reg_starttry,
2853                       const bool utf8_target)
2854 {
2855     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2856     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2857     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2858     /* The part of the string before starttry has one color
2859        (pref0_len chars), between starttry and current
2860        position another one (pref_len - pref0_len chars),
2861        after the current position the third one.
2862        We assume that pref0_len <= pref_len, otherwise we
2863        decrease pref0_len.  */
2864     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2865         ? (5 + taill) - l : locinput - loc_bostr;
2866     int pref0_len;
2867
2868     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2869
2870     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2871         pref_len++;
2872     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2873     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2874         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2875               ? (5 + taill) - pref_len : loc_regeol - locinput);
2876     while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2877         l--;
2878     if (pref0_len < 0)
2879         pref0_len = 0;
2880     if (pref0_len > pref_len)
2881         pref0_len = pref_len;
2882     {
2883         const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2884
2885         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2886             (locinput - pref_len),pref0_len, 60, 4, 5);
2887
2888         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2889                     (locinput - pref_len + pref0_len),
2890                     pref_len - pref0_len, 60, 2, 3);
2891
2892         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2893                     locinput, loc_regeol - locinput, 10, 0, 1);
2894
2895         const STRLEN tlen=len0+len1+len2;
2896         PerlIO_printf(Perl_debug_log,
2897                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2898                     (IV)(locinput - loc_bostr),
2899                     len0, s0,
2900                     len1, s1,
2901                     (docolor ? "" : "> <"),
2902                     len2, s2,
2903                     (int)(tlen > 19 ? 0 :  19 - tlen),
2904                     "");
2905     }
2906 }
2907
2908 #endif
2909
2910 /* reg_check_named_buff_matched()
2911  * Checks to see if a named buffer has matched. The data array of
2912  * buffer numbers corresponding to the buffer is expected to reside
2913  * in the regexp->data->data array in the slot stored in the ARG() of
2914  * node involved. Note that this routine doesn't actually care about the
2915  * name, that information is not preserved from compilation to execution.
2916  * Returns the index of the leftmost defined buffer with the given name
2917  * or 0 if non of the buffers matched.
2918  */
2919 STATIC I32
2920 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2921 {
2922     I32 n;
2923     RXi_GET_DECL(rex,rexi);
2924     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2925     I32 *nums=(I32*)SvPVX(sv_dat);
2926
2927     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2928
2929     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2930         if ((I32)*PL_reglastparen >= nums[n] &&
2931             PL_regoffs[nums[n]].end != -1)
2932         {
2933             return nums[n];
2934         }
2935     }
2936     return 0;
2937 }
2938
2939
2940 /* free all slabs above current one  - called during LEAVE_SCOPE */
2941
2942 STATIC void
2943 S_clear_backtrack_stack(pTHX_ void *p)
2944 {
2945     regmatch_slab *s = PL_regmatch_slab->next;
2946     PERL_UNUSED_ARG(p);
2947
2948     if (!s)
2949         return;
2950     PL_regmatch_slab->next = NULL;
2951     while (s) {
2952         regmatch_slab * const osl = s;
2953         s = s->next;
2954         Safefree(osl);
2955     }
2956 }
2957
2958
2959 #define SETREX(Re1,Re2) \
2960     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2961     Re1 = (Re2)
2962
2963 STATIC I32                      /* 0 failure, 1 success */
2964 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2965 {
2966 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2967     dMY_CXT;
2968 #endif
2969     dVAR;
2970     register const bool utf8_target = PL_reg_match_utf8;
2971     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2972     REGEXP *rex_sv = reginfo->prog;
2973     regexp *rex = (struct regexp *)SvANY(rex_sv);
2974     RXi_GET_DECL(rex,rexi);
2975     I32 oldsave;
2976     /* the current state. This is a cached copy of PL_regmatch_state */
2977     register regmatch_state *st;
2978     /* cache heavy used fields of st in registers */
2979     register regnode *scan;
2980     register regnode *next;
2981     register U32 n = 0; /* general value; init to avoid compiler warning */
2982     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2983     register char *locinput = PL_reginput;
2984     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2985
2986     bool result = 0;        /* return value of S_regmatch */
2987     int depth = 0;          /* depth of backtrack stack */
2988     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2989     const U32 max_nochange_depth =
2990         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2991         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2992     regmatch_state *yes_state = NULL; /* state to pop to on success of
2993                                                             subpattern */
2994     /* mark_state piggy backs on the yes_state logic so that when we unwind
2995        the stack on success we can update the mark_state as we go */
2996     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2997     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2998     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2999     U32 state_num;
3000     bool no_final = 0;      /* prevent failure from backtracking? */
3001     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3002     char *startpoint = PL_reginput;
3003     SV *popmark = NULL;     /* are we looking for a mark? */
3004     SV *sv_commit = NULL;   /* last mark name seen in failure */
3005     SV *sv_yes_mark = NULL; /* last mark name we have seen
3006                                during a successfull match */
3007     U32 lastopen = 0;       /* last open we saw */
3008     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3009     SV* const oreplsv = GvSV(PL_replgv);
3010     /* these three flags are set by various ops to signal information to
3011      * the very next op. They have a useful lifetime of exactly one loop
3012      * iteration, and are not preserved or restored by state pushes/pops
3013      */
3014     bool sw = 0;            /* the condition value in (?(cond)a|b) */
3015     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
3016     int logical = 0;        /* the following EVAL is:
3017                                 0: (?{...})
3018                                 1: (?(?{...})X|Y)
3019                                 2: (??{...})
3020                                or the following IFMATCH/UNLESSM is:
3021                                 false: plain (?=foo)
3022                                 true:  used as a condition: (?(?=foo))
3023                             */
3024 #ifdef DEBUGGING
3025     GET_RE_DEBUG_FLAGS_DECL;
3026 #endif
3027
3028     PERL_ARGS_ASSERT_REGMATCH;
3029
3030     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3031             PerlIO_printf(Perl_debug_log,"regmatch start\n");
3032     }));
3033     /* on first ever call to regmatch, allocate first slab */
3034     if (!PL_regmatch_slab) {
3035         Newx(PL_regmatch_slab, 1, regmatch_slab);
3036         PL_regmatch_slab->prev = NULL;
3037         PL_regmatch_slab->next = NULL;
3038         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3039     }
3040
3041     oldsave = PL_savestack_ix;
3042     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3043     SAVEVPTR(PL_regmatch_slab);
3044     SAVEVPTR(PL_regmatch_state);
3045
3046     /* grab next free state slot */
3047     st = ++PL_regmatch_state;
3048     if (st >  SLAB_LAST(PL_regmatch_slab))
3049         st = PL_regmatch_state = S_push_slab(aTHX);
3050
3051     /* Note that nextchr is a byte even in UTF */
3052     nextchr = UCHARAT(locinput);
3053     scan = prog;
3054     while (scan != NULL) {
3055
3056         DEBUG_EXECUTE_r( {
3057             SV * const prop = sv_newmortal();
3058             regnode *rnext=regnext(scan);
3059             DUMP_EXEC_POS( locinput, scan, utf8_target );
3060             regprop(rex, prop, scan);
3061
3062             PerlIO_printf(Perl_debug_log,
3063                     "%3"IVdf":%*s%s(%"IVdf")\n",
3064                     (IV)(scan - rexi->program), depth*2, "",
3065                     SvPVX_const(prop),
3066                     (PL_regkind[OP(scan)] == END || !rnext) ?
3067                         0 : (IV)(rnext - rexi->program));
3068         });
3069
3070         next = scan + NEXT_OFF(scan);
3071         if (next == scan)
3072             next = NULL;
3073         state_num = OP(scan);
3074
3075         REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3076       reenter_switch:
3077
3078         assert(PL_reglastparen == &rex->lastparen);
3079         assert(PL_reglastcloseparen == &rex->lastcloseparen);
3080         assert(PL_regoffs == rex->offs);
3081
3082         switch (state_num) {
3083         case BOL:
3084             if (locinput == PL_bostr)
3085             {
3086                 /* reginfo->till = reginfo->bol; */
3087                 break;
3088             }
3089             sayNO;
3090         case MBOL:
3091             if (locinput == PL_bostr ||
3092                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3093             {
3094                 break;
3095             }
3096             sayNO;
3097         case SBOL:
3098             if (locinput == PL_bostr)
3099                 break;
3100             sayNO;
3101         case GPOS:
3102             if (locinput == reginfo->ganch)
3103                 break;
3104             sayNO;
3105
3106         case KEEPS:
3107             /* update the startpoint */
3108             st->u.keeper.val = PL_regoffs[0].start;
3109             PL_reginput = locinput;
3110             PL_regoffs[0].start = locinput - PL_bostr;
3111             PUSH_STATE_GOTO(KEEPS_next, next);
3112             /*NOT-REACHED*/
3113         case KEEPS_next_fail:
3114             /* rollback the start point change */
3115             PL_regoffs[0].start = st->u.keeper.val;
3116             sayNO_SILENT;
3117             /*NOT-REACHED*/
3118         case EOL:
3119                 goto seol;
3120         case MEOL:
3121             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3122                 sayNO;
3123             break;
3124         case SEOL:
3125           seol:
3126             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3127                 sayNO;
3128             if (PL_regeol - locinput > 1)
3129                 sayNO;
3130             break;
3131         case EOS:
3132             if (PL_regeol != locinput)
3133                 sayNO;
3134             break;
3135         case SANY:
3136             if (!nextchr && locinput >= PL_regeol)
3137                 sayNO;
3138             if (utf8_target) {
3139                 locinput += PL_utf8skip[nextchr];
3140                 if (locinput > PL_regeol)
3141                     sayNO;
3142                 nextchr = UCHARAT(locinput);
3143             }
3144             else
3145                 nextchr = UCHARAT(++locinput);
3146             break;
3147         case CANY:
3148             if (!nextchr && locinput >= PL_regeol)
3149                 sayNO;
3150             nextchr = UCHARAT(++locinput);
3151             break;
3152         case REG_ANY:
3153             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3154                 sayNO;
3155             if (utf8_target) {
3156                 locinput += PL_utf8skip[nextchr];
3157                 if (locinput > PL_regeol)
3158                     sayNO;
3159                 nextchr = UCHARAT(locinput);
3160             }
3161             else
3162                 nextchr = UCHARAT(++locinput);
3163             break;
3164
3165 #undef  ST
3166 #define ST st->u.trie
3167         case TRIEC:
3168             /* In this case the charclass data is available inline so
3169                we can fail fast without a lot of extra overhead.
3170              */
3171             if (scan->flags == EXACT || !utf8_target) {
3172                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3173                     DEBUG_EXECUTE_r(
3174                         PerlIO_printf(Perl_debug_log,
3175                                   "%*s  %sfailed to match trie start class...%s\n",
3176                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3177                     );
3178                     sayNO_SILENT;
3179                     /* NOTREACHED */
3180                 }
3181             }
3182             /* FALL THROUGH */
3183         case TRIE:
3184             /* the basic plan of execution of the trie is:
3185              * At the beginning, run though all the states, and
3186              * find the longest-matching word. Also remember the position
3187              * of the shortest matching word. For example, this pattern:
3188              *    1  2 3 4    5
3189              *    ab|a|x|abcd|abc
3190              * when matched against the string "abcde", will generate
3191              * accept states for all words except 3, with the longest
3192              * matching word being 4, and the shortest being 1 (with
3193              * the position being after char 1 of the string).
3194              *
3195              * Then for each matching word, in word order (i.e. 1,2,4,5),
3196              * we run the remainder of the pattern; on each try setting
3197              * the current position to the character following the word,
3198              * returning to try the next word on failure.
3199              *
3200              * We avoid having to build a list of words at runtime by
3201              * using a compile-time structure, wordinfo[].prev, which
3202              * gives, for each word, the previous accepting word (if any).
3203              * In the case above it would contain the mappings 1->2, 2->0,
3204              * 3->0, 4->5, 5->1.  We can use this table to generate, from
3205              * the longest word (4 above), a list of all words, by
3206              * following the list of prev pointers; this gives us the
3207              * unordered list 4,5,1,2. Then given the current word we have
3208              * just tried, we can go through the list and find the
3209              * next-biggest word to try (so if we just failed on word 2,
3210              * the next in the list is 4).
3211              *
3212              * Since at runtime we don't record the matching position in
3213              * the string for each word, we have to work that out for
3214              * each word we're about to process. The wordinfo table holds
3215              * the character length of each word; given that we recorded
3216              * at the start: the position of the shortest word and its
3217              * length in chars, we just need to move the pointer the
3218              * difference between the two char lengths. Depending on
3219              * Unicode status and folding, that's cheap or expensive.
3220              *
3221              * This algorithm is optimised for the case where are only a
3222              * small number of accept states, i.e. 0,1, or maybe 2.
3223              * With lots of accepts states, and having to try all of them,
3224              * it becomes quadratic on number of accept states to find all
3225              * the next words.
3226              */
3227
3228             {
3229                 /* what type of TRIE am I? (utf8 makes this contextual) */
3230                 DECL_TRIE_TYPE(scan);
3231
3232                 /* what trie are we using right now */
3233                 reg_trie_data * const trie
3234                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3235                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3236                 U32 state = trie->startstate;
3237
3238                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3239                     !TRIE_BITMAP_TEST(trie,*locinput)
3240                 ) {
3241                     if (trie->states[ state ].wordnum) {
3242                          DEBUG_EXECUTE_r(
3243                             PerlIO_printf(Perl_debug_log,
3244                                           "%*s  %smatched empty string...%s\n",
3245                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3246                         );
3247                         if (!trie->jump)
3248                             break;
3249                     } else {
3250                         DEBUG_EXECUTE_r(
3251                             PerlIO_printf(Perl_debug_log,
3252                                           "%*s  %sfailed to match trie start class...%s\n",
3253                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3254                         );
3255                         sayNO_SILENT;
3256                    }
3257                 }
3258
3259             {
3260                 U8 *uc = ( U8* )locinput;
3261
3262                 STRLEN len = 0;
3263                 STRLEN foldlen = 0;
3264                 U8 *uscan = (U8*)NULL;
3265                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3266                 U32 charcount = 0; /* how many input chars we have matched */
3267                 U32 accepted = 0; /* have we seen any accepting states? */
3268
3269                 ST.B = next;
3270                 ST.jump = trie->jump;
3271                 ST.me = scan;
3272                 ST.firstpos = NULL;
3273                 ST.longfold = FALSE; /* char longer if folded => it's harder */
3274                 ST.nextword = 0;
3275
3276                 /* fully traverse the TRIE; note the position of the
3277                    shortest accept state and the wordnum of the longest
3278                    accept state */
3279
3280                 while ( state && uc <= (U8*)PL_regeol ) {
3281                     U32 base = trie->states[ state ].trans.base;
3282                     UV uvc = 0;
3283                     U16 charid = 0;
3284                     U16 wordnum;
3285                     wordnum = trie->states[ state ].wordnum;
3286
3287                     if (wordnum) { /* it's an accept state */
3288                         if (!accepted) {
3289                             accepted = 1;
3290                             /* record first match position */
3291                             if (ST.longfold) {
3292                                 ST.firstpos = (U8*)locinput;
3293                                 ST.firstchars = 0;
3294                             }
3295                             else {
3296                                 ST.firstpos = uc;
3297                                 ST.firstchars = charcount;
3298                             }
3299                         }
3300                         if (!ST.nextword || wordnum < ST.nextword)
3301                             ST.nextword = wordnum;
3302                         ST.topword = wordnum;
3303                     }
3304
3305                     DEBUG_TRIE_EXECUTE_r({
3306                                 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3307                                 PerlIO_printf( Perl_debug_log,
3308                                     "%*s  %sState: %4"UVxf" Accepted: %c ",
3309                                     2+depth * 2, "", PL_colors[4],
3310                                     (UV)state, (accepted ? 'Y' : 'N'));
3311                     });
3312
3313                     /* read a char and goto next state */
3314                     if ( base ) {
3315                         I32 offset;
3316                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3317                                              uscan, len, uvc, charid, foldlen,
3318                                              foldbuf, uniflags);
3319                         charcount++;
3320                         if (foldlen>0)
3321                             ST.longfold = TRUE;
3322                         if (charid &&
3323                              ( ((offset =
3324                               base + charid - 1 - trie->uniquecharcount)) >= 0)
3325
3326                              && ((U32)offset < trie->lasttrans)
3327                              && trie->trans[offset].check == state)
3328                         {
3329                             state = trie->trans[offset].next;
3330                         }
3331                         else {
3332                             state = 0;
3333                         }
3334                         uc += len;
3335
3336                     }
3337                     else {
3338                         state = 0;
3339                     }
3340                     DEBUG_TRIE_EXECUTE_r(
3341                         PerlIO_printf( Perl_debug_log,
3342                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3343                             charid, uvc, (UV)state, PL_colors[5] );
3344                     );
3345                 }
3346                 if (!accepted)
3347                    sayNO;
3348
3349                 /* calculate total number of accept states */
3350                 {
3351                     U16 w = ST.topword;
3352                     accepted = 0;
3353                     while (w) {
3354                         w = trie->wordinfo[w].prev;
3355                         accepted++;
3356                     }
3357                     ST.accepted = accepted;
3358                 }
3359
3360                 DEBUG_EXECUTE_r(
3361                     PerlIO_printf( Perl_debug_log,
3362                         "%*s  %sgot %"IVdf" possible matches%s\n",
3363                         REPORT_CODE_OFF + depth * 2, "",
3364                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3365                 );
3366                 goto trie_first_try; /* jump into the fail handler */
3367             }}
3368             /* NOTREACHED */
3369
3370         case TRIE_next_fail: /* we failed - try next alternative */
3371             if ( ST.jump) {
3372                 REGCP_UNWIND(ST.cp);
3373                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3374                     PL_regoffs[n].end = -1;
3375                 *PL_reglastparen = n;
3376             }
3377             if (!--ST.accepted) {
3378                 DEBUG_EXECUTE_r({
3379                     PerlIO_printf( Perl_debug_log,
3380                         "%*s  %sTRIE failed...%s\n",
3381                         REPORT_CODE_OFF+depth*2, "",
3382                         PL_colors[4],
3383                         PL_colors[5] );
3384                 });
3385                 sayNO_SILENT;
3386             }
3387             {
3388                 /* Find next-highest word to process.  Note that this code
3389                  * is O(N^2) per trie run (O(N) per branch), so keep tight */
3390                 register U16 min = 0;
3391                 register U16 word;
3392                 register U16 const nextword = ST.nextword;
3393                 register reg_trie_wordinfo * const wordinfo
3394                     = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3395                 for (word=ST.topword; word; word=wordinfo[word].prev) {
3396                     if (word > nextword && (!min || word < min))
3397                         min = word;
3398                 }
3399                 ST.nextword = min;
3400             }
3401
3402           trie_first_try:
3403             if (do_cutgroup) {
3404                 do_cutgroup = 0;
3405                 no_final = 0;
3406             }
3407
3408             if ( ST.jump) {
3409                 ST.lastparen = *PL_reglastparen;
3410                 REGCP_SET(ST.cp);
3411             }
3412
3413             /* find start char of end of current word */
3414             {
3415                 U32 chars; /* how many chars to skip */
3416                 U8 *uc = ST.firstpos;
3417                 reg_trie_data * const trie
3418                     = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3419
3420                 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3421                             >=  ST.firstchars);
3422                 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3423                             - ST.firstchars;
3424
3425                 if (ST.longfold) {
3426                     /* the hard option - fold each char in turn and find
3427                      * its folded length (which may be different */
3428                     U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3429                     STRLEN foldlen;
3430                     STRLEN len;
3431                     UV uvc;
3432                     U8 *uscan;
3433
3434                     while (chars) {
3435                         if (utf8_target) {
3436                             uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3437                                                     uniflags);
3438                             uc += len;
3439                         }
3440                         else {
3441                             uvc = *uc;
3442                             uc++;
3443                         }
3444                         uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3445                         uscan = foldbuf;
3446                         while (foldlen) {
3447                             if (!--chars)
3448                                 break;
3449                             uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3450                                             uniflags);
3451                             uscan += len;
3452                             foldlen -= len;
3453                         }
3454                     }
3455                 }
3456                 else {
3457                     if (utf8_target)
3458                         while (chars--)
3459                             uc += UTF8SKIP(uc);
3460                     else
3461                         uc += chars;
3462                 }
3463                 PL_reginput = (char *)uc;
3464             }
3465
3466             scan = (ST.jump && ST.jump[ST.nextword])
3467                         ? ST.me + ST.jump[ST.nextword]
3468                         : ST.B;
3469
3470             DEBUG_EXECUTE_r({
3471                 PerlIO_printf( Perl_debug_log,
3472                     "%*s  %sTRIE matched word #%d, continuing%s\n",
3473                     REPORT_CODE_OFF+depth*2, "",
3474                     PL_colors[4],
3475                     ST.nextword,
3476                     PL_colors[5]
3477                     );
3478             });
3479
3480             if (ST.accepted > 1 || has_cutgroup) {
3481                 PUSH_STATE_GOTO(TRIE_next, scan);
3482                 /* NOTREACHED */
3483             }
3484             /* only one choice left - just continue */
3485             DEBUG_EXECUTE_r({
3486                 AV *const trie_words
3487                     = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3488                 SV ** const tmp = av_fetch( trie_words,
3489                     ST.nextword-1, 0 );
3490                 SV *sv= tmp ? sv_newmortal() : NULL;
3491
3492                 PerlIO_printf( Perl_debug_log,
3493                     "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3494                     REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3495                     ST.nextword,
3496                     tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3497                             PL_colors[0], PL_colors[1],
3498                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3499                         )
3500                     : "not compiled under -Dr",
3501                     PL_colors[5] );
3502             });
3503
3504             locinput = PL_reginput;
3505             nextchr = UCHARAT(locinput);
3506             continue; /* execute rest of RE */
3507             /* NOTREACHED */
3508 #undef  ST
3509
3510         case EXACT: {
3511             char *s = STRING(scan);
3512             ln = STR_LEN(scan);
3513             if (utf8_target != UTF_PATTERN) {
3514                 /* The target and the pattern have differing utf8ness. */
3515                 char *l = locinput;
3516                 const char * const e = s + ln;
3517
3518                 if (utf8_target) {
3519                     /* The target is utf8, the pattern is not utf8. */
3520                     while (s < e) {
3521                         STRLEN ulen;
3522                         if (l >= PL_regeol)
3523                              sayNO;
3524                         if (NATIVE_TO_UNI(*(U8*)s) !=
3525                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3526                                             uniflags))
3527                              sayNO;
3528                         l += ulen;
3529                         s ++;
3530                     }
3531                 }
3532                 else {
3533                     /* The target is not utf8, the pattern is utf8. */
3534                     while (s < e) {
3535                         STRLEN ulen;
3536                         if (l >= PL_regeol)
3537                             sayNO;
3538                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3539                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3540                                            uniflags))
3541                             sayNO;
3542                         s += ulen;
3543                         l ++;
3544                     }
3545                 }
3546                 locinput = l;
3547                 nextchr = UCHARAT(locinput);
3548                 break;
3549             }
3550             /* The target and the pattern have the same utf8ness. */
3551             /* Inline the first character, for speed. */
3552             if (UCHARAT(s) != nextchr)
3553                 sayNO;
3554             if (PL_regeol - locinput < ln)
3555                 sayNO;
3556             if (ln > 1 && memNE(s, locinput, ln))
3557                 sayNO;
3558             locinput += ln;
3559             nextchr = UCHARAT(locinput);
3560             break;
3561             }
3562         case EXACTFL: {
3563             re_fold_t folder;
3564             const U8 * fold_array;
3565             const char * s;
3566
3567             PL_reg_flags |= RF_tainted;
3568             folder = foldEQ_locale;
3569             fold_array = PL_fold_locale;
3570             goto do_exactf;
3571
3572         case EXACTFU:
3573             folder = foldEQ_latin1;
3574             fold_array = PL_fold_latin1;
3575             goto do_exactf;
3576
3577         case EXACTF:
3578             folder = foldEQ;
3579             fold_array = PL_fold;
3580
3581           do_exactf:
3582             s = STRING(scan);
3583             ln = STR_LEN(scan);
3584
3585             if (utf8_target || UTF_PATTERN) {
3586               /* Either target or the pattern are utf8. */
3587                 const char * const l = locinput;
3588                 char *e = PL_regeol;
3589
3590                 if (! foldEQ_utf8(s, 0,  ln, cBOOL(UTF_PATTERN),
3591                                l, &e, 0,  utf8_target)) {
3592                      /* One more case for the sharp s:
3593                       * pack("U0U*", 0xDF) =~ /ss/i,
3594                       * the 0xC3 0x9F are the UTF-8
3595                       * byte sequence for the U+00DF. */
3596
3597                      if (!(utf8_target &&
3598                            toLOWER(s[0]) == 's' &&
3599                            ln >= 2 &&
3600                            toLOWER(s[1]) == 's' &&
3601                            (U8)l[0] == 0xC3 &&
3602                            e - l >= 2 &&
3603                            (U8)l[1] == 0x9F))
3604                           sayNO;
3605                 }
3606                 locinput = e;
3607                 nextchr = UCHARAT(locinput);
3608                 break;
3609             }
3610
3611             /* Neither the target and the pattern are utf8. */
3612
3613             /* Inline the first character, for speed. */
3614             if (UCHARAT(s) != nextchr &&
3615                 UCHARAT(s) != fold_array[nextchr])
3616             {
3617                 sayNO;
3618             }
3619             if (PL_regeol - locinput < ln)
3620                 sayNO;
3621             if (ln > 1 && ! folder(s, locinput, ln))
3622                 sayNO;
3623             locinput += ln;
3624             nextchr = UCHARAT(locinput);
3625             break;
3626         }
3627         case BOUNDL:
3628         case NBOUNDL:
3629             PL_reg_flags |= RF_tainted;
3630             /* FALL THROUGH */
3631         case BOUND:
3632         case NBOUND:
3633             /* was last char in word? */
3634             if (utf8_target) {
3635                 if (locinput == PL_bostr)
3636                     ln = '\n';
3637                 else {
3638                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3639
3640                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3641                 }
3642                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3643                     ln = isALNUM_uni(ln);
3644                     LOAD_UTF8_CHARCLASS_ALNUM();
3645                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3646                 }
3647                 else {
3648                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3649                     n = isALNUM_LC_utf8((U8*)locinput);
3650                 }
3651             }
3652             else {
3653                 ln = (locinput != PL_bostr) ?
3654                     UCHARAT(locinput - 1) : '\n';
3655                 if (FLAGS(scan) & USE_UNI) {
3656
3657                     /* Here, can't be BOUNDL or NBOUNDL because they never set
3658                      * the flags to USE_UNI */
3659                     ln = isWORDCHAR_L1(ln);
3660                     n = isWORDCHAR_L1(nextchr);
3661                 }
3662                 else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3663                     ln = isALNUM(ln);
3664                     n = isALNUM(nextchr);
3665                 }
3666                 else {
3667                     ln = isALNUM_LC(ln);
3668                     n = isALNUM_LC(nextchr);
3669                 }
3670             }
3671             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3672                                     OP(scan) == BOUNDL))
3673                     sayNO;
3674             break;
3675         case ANYOF:
3676             if (utf8_target) {
3677                 STRLEN inclasslen = PL_regeol - locinput;
3678                 if (locinput >= PL_regeol)
3679                     sayNO;
3680
3681                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3682                     goto anyof_fail;
3683                 locinput += inclasslen;
3684                 nextchr = UCHARAT(locinput);
3685                 break;
3686             }
3687             else {
3688                 if (nextchr < 0)
3689                     nextchr = UCHARAT(locinput);
3690                 if (!nextchr && locinput >= PL_regeol)
3691                     sayNO;
3692                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3693                     goto anyof_fail;
3694                 nextchr = UCHARAT(++locinput);
3695                 break;
3696             }
3697         anyof_fail:
3698             /* If we might have the case of the German sharp s
3699              * in a casefolding Unicode character class. */
3700
3701             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3702                  locinput += SHARP_S_SKIP;
3703                  nextchr = UCHARAT(locinput);
3704             }
3705             else
3706                  sayNO;
3707             break;
3708         /* Special char classes - The defines start on line 129 or so */
3709         CCC_TRY_AFF_U( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3710         CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3711
3712         CCC_TRY_AFF_U( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3713         CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3714
3715         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3716         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3717
3718         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3719                        a Unicode extended Grapheme Cluster */
3720             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3721               extended Grapheme Cluster is:
3722
3723                CR LF
3724                | Prepend* Begin Extend*
3725                | .
3726
3727                Begin is (Hangul-syllable | ! Control)
3728                Extend is (Grapheme_Extend | Spacing_Mark)
3729                Control is [ GCB_Control CR LF ]
3730
3731                The discussion below shows how the code for CLUMP is derived
3732                from this regex.  Note that most of these concepts are from
3733                property values of the Grapheme Cluster Boundary (GCB) property.
3734                No code point can have multiple property values for a given
3735                property.  Thus a code point in Prepend can't be in Control, but
3736                it must be in !Control.  This is why Control above includes
3737                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3738                property separately, and so can't be in GCB_Control, even though
3739                they logically are controls.  Control is not the same as gc=cc,
3740                but includes format and other characters as well.
3741
3742                The Unicode definition of Hangul-syllable is:
3743                    L+
3744                    | (L* ( ( V | LV ) V* | LVT ) T*)
3745                    | T+
3746                   )
3747                Each of these is a value for the GCB property, and hence must be
3748                disjoint, so the order they are tested is immaterial, so the
3749                above can safely be changed to
3750                    T+
3751                    | L+
3752                    | (L* ( LVT | ( V | LV ) V*) T*)
3753
3754                The last two terms can be combined like this:
3755                    L* ( L
3756                         | (( LVT | ( V | LV ) V*) T*))
3757
3758                And refactored into this:
3759                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3760
3761                That means that if we have seen any L's at all we can quit
3762                there, but if the next character is a LVT, a V or and LV we
3763                should keep going.
3764
3765                There is a subtlety with Prepend* which showed up in testing.
3766                Note that the Begin, and only the Begin is required in:
3767                 | Prepend* Begin Extend*
3768                Also, Begin contains '! Control'.  A Prepend must be a '!
3769                Control', which means it must be a Begin.  What it comes down to
3770                is that if we match Prepend* and then find no suitable Begin
3771                afterwards, that if we backtrack the last Prepend, that one will
3772                be a suitable Begin.
3773             */
3774
3775             if (locinput >= PL_regeol)
3776                 sayNO;
3777             if  (! utf8_target) {
3778
3779                 /* Match either CR LF  or '.', as all the other possibilities
3780                  * require utf8 */
3781                 locinput++;         /* Match the . or CR */
3782                 if (nextchr == '\r'
3783                     && locinput < PL_regeol
3784                     && UCHARAT(locinput) == '\n') locinput++;
3785             }
3786             else {
3787
3788                 /* Utf8: See if is ( CR LF ); already know that locinput <
3789                  * PL_regeol, so locinput+1 is in bounds */
3790                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3791                     locinput += 2;
3792                 }
3793                 else {
3794                     /* In case have to backtrack to beginning, then match '.' */
3795                     char *starting = locinput;
3796
3797                     /* In case have to backtrack the last prepend */
3798                     char *previous_prepend = 0;
3799
3800                     LOAD_UTF8_CHARCLASS_GCB();
3801
3802                     /* Match (prepend)* */
3803                     while (locinput < PL_regeol
3804                            && swash_fetch(PL_utf8_X_prepend,
3805                                           (U8*)locinput, utf8_target))
3806                     {
3807                         previous_prepend = locinput;
3808                         locinput += UTF8SKIP(locinput);
3809                     }
3810
3811                     /* As noted above, if we matched a prepend character, but
3812                      * the next thing won't match, back off the last prepend we
3813                      * matched, as it is guaranteed to match the begin */
3814                     if (previous_prepend
3815                         && (locinput >=  PL_regeol
3816                             || ! swash_fetch(PL_utf8_X_begin,
3817                                              (U8*)locinput, utf8_target)))
3818                     {
3819                         locinput = previous_prepend;
3820                     }
3821
3822                     /* Note that here we know PL_regeol > locinput, as we
3823                      * tested that upon input to this switch case, and if we
3824                      * moved locinput forward, we tested the result just above
3825                      * and it either passed, or we backed off so that it will
3826                      * now pass */
3827                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3828
3829                         /* Here did not match the required 'Begin' in the
3830                          * second term.  So just match the very first
3831                          * character, the '.' of the final term of the regex */
3832                         locinput = starting + UTF8SKIP(starting);
3833                     } else {
3834
3835                         /* Here is the beginning of a character that can have
3836                          * an extender.  It is either a hangul syllable, or a
3837                          * non-control */
3838                         if (swash_fetch(PL_utf8_X_non_hangul,
3839                                         (U8*)locinput, utf8_target))
3840                         {
3841
3842                             /* Here not a Hangul syllable, must be a
3843                              * ('!  * Control') */
3844                             locinput += UTF8SKIP(locinput);
3845                         } else {
3846
3847                             /* Here is a Hangul syllable.  It can be composed
3848                              * of several individual characters.  One
3849                              * possibility is T+ */
3850                             if (swash_fetch(PL_utf8_X_T,
3851                                             (U8*)locinput, utf8_target))
3852                             {
3853                                 while (locinput < PL_regeol
3854                                         && swash_fetch(PL_utf8_X_T,
3855                                                         (U8*)locinput, utf8_target))
3856                                 {
3857                                     locinput += UTF8SKIP(locinput);
3858                                 }
3859                             } else {
3860
3861                                 /* Here, not T+, but is a Hangul.  That means
3862                                  * it is one of the others: L, LV, LVT or V,
3863                                  * and matches:
3864                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3865
3866                                 /* Match L*           */
3867                                 while (locinput < PL_regeol
3868                                         && swash_fetch(PL_utf8_X_L,
3869                                                         (U8*)locinput, utf8_target))
3870                                 {
3871                                     locinput += UTF8SKIP(locinput);
3872                                 }
3873
3874                                 /* Here, have exhausted L*.  If the next
3875                                  * character is not an LV, LVT nor V, it means
3876                                  * we had to have at least one L, so matches L+
3877                                  * in the original equation, we have a complete
3878                                  * hangul syllable.  Are done. */
3879
3880                                 if (locinput < PL_regeol
3881                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3882                                                     (U8*)locinput, utf8_target))
3883                                 {
3884
3885                                     /* Otherwise keep going.  Must be LV, LVT
3886                                      * or V.  See if LVT */
3887                                     if (swash_fetch(PL_utf8_X_LVT,
3888                                                     (U8*)locinput, utf8_target))
3889                                     {
3890                                         locinput += UTF8SKIP(locinput);
3891                                     } else {
3892
3893                                         /* Must be  V or LV.  Take it, then
3894                                          * match V*     */
3895                                         locinput += UTF8SKIP(locinput);
3896                                         while (locinput < PL_regeol
3897                                                 && swash_fetch(PL_utf8_X_V,
3898                                                          (U8*)locinput, utf8_target))
3899                                         {
3900                                             locinput += UTF8SKIP(locinput);
3901                                         }
3902                                     }
3903
3904                                     /* And any of LV, LVT, or V can be followed
3905                                      * by T*            */
3906                                     while (locinput < PL_regeol
3907                                            && swash_fetch(PL_utf8_X_T,
3908                                                            (U8*)locinput,
3909                                                            utf8_target))
3910                                     {
3911                                         locinput += UTF8SKIP(locinput);
3912                                     }
3913                                 }
3914                             }
3915                         }
3916
3917                         /* Match any extender */
3918                         while (locinput < PL_regeol
3919                                 && swash_fetch(PL_utf8_X_extend,
3920                                                 (U8*)locinput, utf8_target))
3921                         {
3922                             locinput += UTF8SKIP(locinput);
3923                         }
3924                     }
3925                 }
3926                 if (locinput > PL_regeol) sayNO;
3927             }
3928             nextchr = UCHARAT(locinput);
3929             break;
3930
3931         case NREFFL:
3932         {   /* The capture buffer cases.  The ones beginning with N for the
3933                named buffers just convert to the equivalent numbered and
3934                pretend they were called as the corresponding numbered buffer
3935                op.  */
3936             /* don't initialize these, it makes C++ unhappy */
3937             char *s;
3938             char type;
3939             re_fold_t folder;
3940             const U8 *fold_array;
3941
3942             folder = NULL;      /* NULL assumes will be NREF, REF: no
3943                                    folding */
3944             fold_array = NULL;
3945
3946             PL_reg_flags |= RF_tainted;
3947             folder = foldEQ_locale;
3948             fold_array = PL_fold_locale;
3949             type = REFFL;
3950             goto do_nref;
3951
3952         case NREFFU:
3953             folder = foldEQ_latin1;
3954             fold_array = PL_fold_latin1;
3955             type = REFFU;
3956             goto do_nref;
3957
3958         case NREFF:
3959             folder = foldEQ;
3960             fold_array = PL_fold;
3961             type = REFF;
3962             goto do_nref;
3963
3964         case NREF:
3965             type = REF;
3966             folder = NULL;
3967             fold_array = NULL;
3968           do_nref:
3969
3970             /* For the named back references, find the corresponding buffer
3971              * number */
3972             n = reg_check_named_buff_matched(rex,scan);
3973
3974             if ( ! n ) {
3975                 sayNO;
3976             }
3977             goto do_nref_ref_common;
3978
3979         case REFFL:
3980             PL_reg_flags |= RF_tainted;
3981             folder = foldEQ_locale;
3982             fold_array = PL_fold_locale;
3983             goto do_ref;
3984
3985         case REFFU:
3986             folder = foldEQ_latin1;
3987             fold_array = PL_fold_latin1;
3988             goto do_ref;
3989
3990         case REFF:
3991             folder = foldEQ;
3992             fold_array = PL_fold;
3993             goto do_ref;
3994
3995         case REF:
3996             folder = NULL;
3997             fold_array = NULL;
3998
3999           do_ref:
4000             type = OP(scan);
4001             n = ARG(scan);  /* which paren pair */
4002
4003           do_nref_ref_common:
4004             ln = PL_regoffs[n].start;
4005             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4006             if (*PL_reglastparen < n || ln == -1)
4007                 sayNO;                  /* Do not match unless seen CLOSEn. */
4008             if (ln == PL_regoffs[n].end)
4009                 break;
4010
4011             s = PL_bostr + ln;
4012             if (type != REF     /* REF can do byte comparison */
4013                 && (utf8_target
4014                     || (type == REFFU
4015                         && (*s == (char) LATIN_SMALL_LETTER_SHARP_S
4016                             || *locinput == (char) LATIN_SMALL_LETTER_SHARP_S))))
4017             { /* XXX handle REFFL better */
4018                 char * limit = PL_regeol;
4019
4020                 /* This call case insensitively compares the entire buffer
4021                     * at s, with the current input starting at locinput, but
4022                     * not going off the end given by PL_regeol, and returns in
4023                     * limit upon success, how much of the current input was
4024                     * matched */
4025                 if (! foldEQ_utf8(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4026                                     locinput, &limit, 0, utf8_target))
4027                 {
4028                     sayNO;
4029                 }
4030                 locinput = limit;
4031                 nextchr = UCHARAT(locinput);
4032                 break;
4033             }
4034
4035             /* Not utf8:  Inline the first character, for speed. */
4036             if (UCHARAT(s) != nextchr &&
4037                 (type == REF ||
4038                  UCHARAT(s) != fold_array[nextchr]))
4039                 sayNO;
4040             ln = PL_regoffs[n].end - ln;
4041             if (locinput + ln > PL_regeol)
4042                 sayNO;
4043             if (ln > 1 && (type == REF
4044                            ? memNE(s, locinput, ln)
4045                            : ! folder(s, locinput, ln)))
4046                 sayNO;
4047             locinput += ln;
4048             nextchr = UCHARAT(locinput);
4049             break;
4050         }
4051         case NOTHING:
4052         case TAIL:
4053             break;
4054         case BACK:
4055             break;
4056
4057 #undef  ST
4058 #define ST st->u.eval
4059         {
4060             SV *ret;
4061             REGEXP *re_sv;
4062             regexp *re;
4063             regexp_internal *rei;
4064             regnode *startpoint;
4065
4066         case GOSTART:
4067         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4068             if (cur_eval && cur_eval->locinput==locinput) {
4069                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4070                     Perl_croak(aTHX_ "Infinite recursion in regex");
4071                 if ( ++nochange_depth > max_nochange_depth )
4072                     Perl_croak(aTHX_
4073                         "Pattern subroutine nesting without pos change"
4074                         " exceeded limit in regex");
4075             } else {
4076                 nochange_depth = 0;
4077             }
4078             re_sv = rex_sv;
4079             re = rex;
4080             rei = rexi;
4081             (void)ReREFCNT_inc(rex_sv);
4082             if (OP(scan)==GOSUB) {
4083                 startpoint = scan + ARG2L(scan);
4084                 ST.close_paren = ARG(scan);
4085             } else {
4086                 startpoint = rei->program+1;
4087                 ST.close_paren = 0;
4088             }
4089             goto eval_recurse_doit;
4090             /* NOTREACHED */
4091         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4092             if (cur_eval && cur_eval->locinput==locinput) {
4093                 if ( ++nochange_depth > max_nochange_depth )
4094                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4095             } else {
4096                 nochange_depth = 0;
4097             }
4098             {
4099                 /* execute the code in the {...} */
4100                 dSP;
4101                 SV ** const before = SP;
4102                 OP_4tree * const oop = PL_op;
4103                 COP * const ocurcop = PL_curcop;
4104                 PAD *old_comppad;
4105                 char *saved_regeol = PL_regeol;
4106                 struct re_save_state saved_state;
4107
4108                 /* To not corrupt the existing regex state while executing the
4109                  * eval we would normally put it on the save stack, like with
4110                  * save_re_context. However, re-evals have a weird scoping so we
4111                  * can't just add ENTER/LEAVE here. With that, things like
4112                  *
4113                  *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4114                  *
4115                  * would break, as they expect the localisation to be unwound
4116                  * only when the re-engine backtracks through the bit that
4117                  * localised it.
4118                  *
4119                  * What we do instead is just saving the state in a local c
4120                  * variable.
4121                  */
4122                 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4123
4124                 n = ARG(scan);
4125                 PL_op = (OP_4tree*)rexi->data->data[n];
4126                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4127                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4128                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4129                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4130
4131                 if (sv_yes_mark) {
4132                     SV *sv_mrk = get_sv("REGMARK", 1);
4133                     sv_setsv(sv_mrk, sv_yes_mark);
4134                 }
4135
4136                 CALLRUNOPS(aTHX);                       /* Scalar context. */
4137                 SPAGAIN;
4138                 if (SP == before)
4139                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4140                 else {
4141                     ret = POPs;
4142                     PUTBACK;
4143                 }
4144
4145                 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4146
4147                 PL_op = oop;
4148                 PAD_RESTORE_LOCAL(old_comppad);
4149                 PL_curcop = ocurcop;
4150                 PL_regeol = saved_regeol;
4151                 if (!logical) {
4152                     /* /(?{...})/ */
4153                     sv_setsv(save_scalar(PL_replgv), ret);
4154                     break;
4155                 }
4156             }
4157             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4158                 logical = 0;
4159                 {
4160                     /* extract RE object from returned value; compiling if
4161                      * necessary */
4162                     MAGIC *mg = NULL;
4163                     REGEXP *rx = NULL;
4164
4165                     if (SvROK(ret)) {
4166                         SV *const sv = SvRV(ret);
4167
4168                         if (SvTYPE(sv) == SVt_REGEXP) {
4169                             rx = (REGEXP*) sv;
4170                         } else if (SvSMAGICAL(sv)) {
4171                             mg = mg_find(sv, PERL_MAGIC_qr);
4172                             assert(mg);
4173                         }
4174                     } else if (SvTYPE(ret) == SVt_REGEXP) {
4175                         rx = (REGEXP*) ret;
4176                     } else if (SvSMAGICAL(ret)) {
4177                         if (SvGMAGICAL(ret)) {
4178                             /* I don't believe that there is ever qr magic
4179                                here.  */
4180                             assert(!mg_find(ret, PERL_MAGIC_qr));
4181                             sv_unmagic(ret, PERL_MAGIC_qr);
4182                         }
4183                         else {
4184                             mg = mg_find(ret, PERL_MAGIC_qr);
4185                             /* testing suggests mg only ends up non-NULL for
4186                                scalars who were upgraded and compiled in the
4187                                else block below. In turn, this is only
4188                                triggered in the "postponed utf8 string" tests
4189                                in t/op/pat.t  */
4190                         }
4191                     }
4192
4193                     if (mg) {
4194                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4195                         assert(rx);
4196                     }
4197                     if (rx) {
4198                         rx = reg_temp_copy(NULL, rx);
4199                     }
4200                     else {
4201                         U32 pm_flags = 0;
4202                         const I32 osize = PL_regsize;
4203
4204                         if (DO_UTF8(ret)) {
4205                             assert (SvUTF8(ret));
4206                         } else if (SvUTF8(ret)) {
4207                             /* Not doing UTF-8, despite what the SV says. Is
4208                                this only if we're trapped in use 'bytes'?  */
4209                             /* Make a copy of the octet sequence, but without
4210                                the flag on, as the compiler now honours the
4211                                SvUTF8 flag on ret.  */
4212                             STRLEN len;
4213                             const char *const p = SvPV(ret, len);
4214                             ret = newSVpvn_flags(p, len, SVs_TEMP);
4215                         }
4216                         rx = CALLREGCOMP(ret, pm_flags);
4217                         if (!(SvFLAGS(ret)
4218                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4219                                  | SVs_GMG))) {
4220                             /* This isn't a first class regexp. Instead, it's
4221                                caching a regexp onto an existing, Perl visible
4222                                scalar.  */
4223                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4224                         }
4225                         PL_regsize = osize;
4226                     }
4227                     re_sv = rx;
4228                     re = (struct regexp *)SvANY(rx);
4229                 }
4230                 RXp_MATCH_COPIED_off(re);
4231                 re->subbeg = rex->subbeg;
4232                 re->sublen = rex->sublen;
4233                 rei = RXi_GET(re);
4234                 DEBUG_EXECUTE_r(
4235                     debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4236                         "Matching embedded");
4237                 );
4238                 startpoint = rei->program + 1;
4239                 ST.close_paren = 0; /* only used for GOSUB */
4240                 /* borrowed from regtry */
4241                 if (PL_reg_start_tmpl <= re->nparens) {
4242                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4243                     if(PL_reg_start_tmp)
4244                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4245                     else
4246                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4247                 }
4248
4249         eval_recurse_doit: /* Share code with GOSUB below this line */
4250                 /* run the pattern returned from (??{...}) */
4251                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4252                 REGCP_SET(ST.lastcp);
4253
4254                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4255
4256                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4257                 PL_reglastparen = &re->lastparen;
4258                 PL_reglastcloseparen = &re->lastcloseparen;
4259                 re->lastparen = 0;
4260                 re->lastcloseparen = 0;
4261
4262                 PL_reginput = locinput;
4263                 PL_regsize = 0;
4264
4265                 /* XXXX This is too dramatic a measure... */
4266                 PL_reg_maxiter = 0;
4267
4268                 ST.toggle_reg_flags = PL_reg_flags;
4269                 if (RX_UTF8(re_sv))
4270                     PL_reg_flags |= RF_utf8;
4271                 else
4272                     PL_reg_flags &= ~RF_utf8;
4273                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4274
4275                 ST.prev_rex = rex_sv;
4276                 ST.prev_curlyx = cur_curlyx;
4277                 SETREX(rex_sv,re_sv);
4278                 rex = re;
4279                 rexi = rei;
4280                 cur_curlyx = NULL;
4281                 ST.B = next;
4282                 ST.prev_eval = cur_eval;
4283                 cur_eval = st;
4284                 /* now continue from first node in postoned RE */
4285                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4286                 /* NOTREACHED */
4287             }
4288             /* logical is 1,   /(?(?{...})X|Y)/ */
4289             sw = cBOOL(SvTRUE(ret));
4290             logical = 0;
4291             break;
4292         }
4293
4294         case EVAL_AB: /* cleanup after a successful (??{A})B */
4295             /* note: this is called twice; first after popping B, then A */
4296             PL_reg_flags ^= ST.toggle_reg_flags;
4297             ReREFCNT_dec(rex_sv);
4298             SETREX(rex_sv,ST.prev_rex);
4299             rex = (struct regexp *)SvANY(rex_sv);
4300             rexi = RXi_GET(rex);
4301             regcpblow(ST.cp);
4302             cur_eval = ST.prev_eval;
4303             cur_curlyx = ST.prev_curlyx;
4304
4305             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4306             PL_reglastparen = &rex->lastparen;
4307             PL_reglastcloseparen = &rex->lastcloseparen;
4308             /* also update PL_regoffs */
4309             PL_regoffs = rex->offs;
4310
4311             /* XXXX This is too dramatic a measure... */
4312             PL_reg_maxiter = 0;
4313             if ( nochange_depth )
4314                 nochange_depth--;
4315             sayYES;
4316
4317
4318         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4319             /* note: this is called twice; first after popping B, then A */
4320             PL_reg_flags ^= ST.toggle_reg_flags;
4321             ReREFCNT_dec(rex_sv);
4322             SETREX(rex_sv,ST.prev_rex);
4323             rex = (struct regexp *)SvANY(rex_sv);
4324             rexi = RXi_GET(rex);
4325             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4326             PL_reglastparen = &rex->lastparen;
4327             PL_reglastcloseparen = &rex->lastcloseparen;
4328
4329             PL_reginput = locinput;
4330             REGCP_UNWIND(ST.lastcp);
4331             regcppop(rex);
4332             cur_eval = ST.prev_eval;
4333             cur_curlyx = ST.prev_curlyx;
4334             /* XXXX This is too dramatic a measure... */
4335             PL_reg_maxiter = 0;
4336             if ( nochange_depth )
4337                 nochange_depth--;
4338             sayNO_SILENT;
4339 #undef ST
4340
4341         case OPEN:
4342             n = ARG(scan);  /* which paren pair */
4343             PL_reg_start_tmp[n] = locinput;
4344             if (n > PL_regsize)
4345                 PL_regsize = n;
4346             lastopen = n;
4347             break;
4348         case CLOSE:
4349             n = ARG(scan);  /* which paren pair */
4350             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4351             PL_regoffs[n].end = locinput - PL_bostr;
4352             /*if (n > PL_regsize)
4353                 PL_regsize = n;*/
4354             if (n > *PL_reglastparen)
4355                 *PL_reglastparen = n;
4356             *PL_reglastcloseparen = n;
4357             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4358                 goto fake_end;
4359             }
4360             break;
4361         case ACCEPT:
4362             if (ARG(scan)){
4363                 regnode *cursor;
4364                 for (cursor=scan;
4365                      cursor && OP(cursor)!=END;
4366                      cursor=regnext(cursor))
4367                 {
4368                     if ( OP(cursor)==CLOSE ){
4369                         n = ARG(cursor);
4370                         if ( n <= lastopen ) {
4371                             PL_regoffs[n].start
4372                                 = PL_reg_start_tmp[n] - PL_bostr;
4373                             PL_regoffs[n].end = locinput - PL_bostr;
4374                             /*if (n > PL_regsize)
4375                             PL_regsize = n;*/
4376                             if (n > *PL_reglastparen)
4377                                 *PL_reglastparen = n;
4378                             *PL_reglastcloseparen = n;
4379                             if ( n == ARG(scan) || (cur_eval &&
4380                                 cur_eval->u.eval.close_paren == n))
4381                                 break;
4382                         }
4383                     }
4384                 }
4385             }
4386             goto fake_end;
4387             /*NOTREACHED*/
4388         case GROUPP:
4389             n = ARG(scan);  /* which paren pair */
4390             sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4391             break;
4392         case NGROUPP:
4393             /* reg_check_named_buff_matched returns 0 for no match */
4394             sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4395             break;
4396         case INSUBP:
4397             n = ARG(scan);
4398             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4399             break;
4400         case DEFINEP:
4401             sw = 0;
4402             break;
4403         case IFTHEN:
4404             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4405             if (sw)
4406                 next = NEXTOPER(NEXTOPER(scan));
4407             else {
4408                 next = scan + ARG(scan);
4409                 if (OP(next) == IFTHEN) /* Fake one. */
4410                     next = NEXTOPER(NEXTOPER(next));
4411             }
4412             break;
4413         case LOGICAL:
4414             logical = scan->flags;
4415             break;
4416
4417 /*******************************************************************
4418
4419 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4420 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4421 STAR/PLUS/CURLY/CURLYN are used instead.)
4422
4423 A*B is compiled as <CURLYX><A><WHILEM><B>
4424
4425 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4426 state, which contains the current count, initialised to -1. It also sets
4427 cur_curlyx to point to this state, with any previous value saved in the
4428 state block.
4429
4430 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4431 since the pattern may possibly match zero times (i.e. it's a while {} loop
4432 rather than a do {} while loop).
4433
4434 Each entry to WHILEM represents a successful match of A. The count in the
4435 CURLYX block is incremented, another WHILEM state is pushed, and execution
4436 passes to A or B depending on greediness and the current count.
4437
4438 For example, if matching against the string a1a2a3b (where the aN are
4439 substrings that match /A/), then the match progresses as follows: (the
4440 pushed states are interspersed with the bits of strings matched so far):
4441
4442     <CURLYX cnt=-1>
4443     <CURLYX cnt=0><WHILEM>
4444     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4445     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4446     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4447     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4448
4449 (Contrast this with something like CURLYM, which maintains only a single
4450 backtrack state:
4451
4452     <CURLYM cnt=0> a1
4453     a1 <CURLYM cnt=1> a2
4454     a1 a2 <CURLYM cnt=2> a3
4455     a1 a2 a3 <CURLYM cnt=3> b
4456 )
4457
4458 Each WHILEM state block marks a point to backtrack to upon partial failure
4459 of A or B, and also contains some minor state data related to that
4460 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4461 overall state, such as the count, and pointers to the A and B ops.
4462
4463 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4464 must always point to the *current* CURLYX block, the rules are:
4465
4466 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4467 and set cur_curlyx to point the new block.
4468
4469 When popping the CURLYX block after a successful or unsuccessful match,
4470 restore the previous cur_curlyx.
4471
4472 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4473 to the outer one saved in the CURLYX block.
4474
4475 When popping the WHILEM block after a successful or unsuccessful B match,
4476 restore the previous cur_curlyx.
4477
4478 Here's an example for the pattern (AI* BI)*BO
4479 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4480
4481 cur_
4482 curlyx backtrack stack
4483 ------ ---------------
4484 NULL
4485 CO     <CO prev=NULL> <WO>
4486 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4487 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4488 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4489
4490 At this point the pattern succeeds, and we work back down the stack to
4491 clean up, restoring as we go:
4492
4493 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4494 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4495 CO     <CO prev=NULL> <WO>
4496 NULL
4497
4498 *******************************************************************/
4499
4500 #define ST st->u.curlyx
4501
4502         case CURLYX:    /* start of /A*B/  (for complex A) */
4503         {
4504             /* No need to save/restore up to this paren */
4505             I32 parenfloor = scan->flags;
4506
4507             assert(next); /* keep Coverity happy */
4508             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4509                 next += ARG(next);
4510
4511             /* XXXX Probably it is better to teach regpush to support
4512                parenfloor > PL_regsize... */
4513             if (parenfloor > (I32)*PL_reglastparen)
4514                 parenfloor = *PL_reglastparen; /* Pessimization... */
4515
4516             ST.prev_curlyx= cur_curlyx;
4517             cur_curlyx = st;
4518             ST.cp = PL_savestack_ix;
4519
4520             /* these fields contain the state of the current curly.
4521              * they are accessed by subsequent WHILEMs */
4522             ST.parenfloor = parenfloor;
4523             ST.me = scan;
4524             ST.B = next;
4525             ST.minmod = minmod;
4526             minmod = 0;
4527             ST.count = -1;      /* this will be updated by WHILEM */
4528             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4529
4530             PL_reginput = locinput;
4531             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4532             /* NOTREACHED */
4533         }
4534
4535         case CURLYX_end: /* just finished matching all of A*B */
4536             cur_curlyx = ST.prev_curlyx;
4537             sayYES;
4538             /* NOTREACHED */
4539
4540         case CURLYX_end_fail: /* just failed to match all of A*B */
4541             regcpblow(ST.cp);
4542             cur_curlyx = ST.prev_curlyx;
4543             sayNO;
4544             /* NOTREACHED */
4545
4546
4547 #undef ST
4548 #define ST st->u.whilem
4549
4550         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4551         {
4552             /* see the discussion above about CURLYX/WHILEM */
4553             I32 n;
4554             int min = ARG1(cur_curlyx->u.curlyx.me);
4555             int max = ARG2(cur_curlyx->u.curlyx.me);
4556             regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4557
4558             assert(cur_curlyx); /* keep Coverity happy */
4559             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4560             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4561             ST.cache_offset = 0;
4562             ST.cache_mask = 0;
4563
4564             PL_reginput = locinput;
4565
4566             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4567                   "%*s  whilem: matched %ld out of %d..%d\n",
4568                   REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4569             );
4570
4571             /* First just match a string of min A's. */
4572
4573             if (n < min) {
4574                 cur_curlyx->u.curlyx.lastloc = locinput;
4575                 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4576                 /* NOTREACHED */
4577             }
4578
4579             /* If degenerate A matches "", assume A done. */
4580
4581             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4582                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4583                    "%*s  whilem: empty match detected, trying continuation...\n",
4584                    REPORT_CODE_OFF+depth*2, "")
4585                 );
4586                 goto do_whilem_B_max;
4587             }
4588
4589             /* super-linear cache processing */
4590
4591             if (scan->flags) {
4592
4593                 if (!PL_reg_maxiter) {
4594                     /* start the countdown: Postpone detection until we
4595                      * know the match is not *that* much linear. */
4596                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4597                     /* possible overflow for long strings and many CURLYX's */
4598                     if (PL_reg_maxiter < 0)
4599                         PL_reg_maxiter = I32_MAX;
4600                     PL_reg_leftiter = PL_reg_maxiter;
4601                 }
4602
4603                 if (PL_reg_leftiter-- == 0) {
4604                     /* initialise cache */
4605                     const I32 size = (PL_reg_maxiter + 7)/8;
4606                     if (PL_reg_poscache) {
4607                         if ((I32)PL_reg_poscache_size < size) {
4608                             Renew(PL_reg_poscache, size, char);
4609                             PL_reg_poscache_size = size;
4610                         }
4611                         Zero(PL_reg_poscache, size, char);
4612                     }
4613                     else {
4614                         PL_reg_poscache_size = size;
4615                         Newxz(PL_reg_poscache, size, char);
4616                     }
4617                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4618       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4619                               PL_colors[4], PL_colors[5])
4620                     );
4621                 }
4622
4623                 if (PL_reg_leftiter < 0) {
4624                     /* have we already failed at this position? */
4625                     I32 offset, mask;
4626                     offset  = (scan->flags & 0xf) - 1
4627                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4628                     mask    = 1 << (offset % 8);
4629                     offset /= 8;
4630                     if (PL_reg_poscache[offset] & mask) {
4631                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4632                             "%*s  whilem: (cache) already tried at this position...\n",
4633                             REPORT_CODE_OFF+depth*2, "")
4634                         );
4635                         sayNO; /* cache records failure */
4636                     }
4637                     ST.cache_offset = offset;
4638                     ST.cache_mask   = mask;
4639                 }
4640             }
4641
4642             /* Prefer B over A for minimal matching. */
4643
4644             if (cur_curlyx->u.curlyx.minmod) {
4645                 ST.save_curlyx = cur_curlyx;
4646                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4647                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4648                 REGCP_SET(ST.lastcp);
4649                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4650                 /* NOTREACHED */
4651             }
4652
4653             /* Prefer A over B for maximal matching. */
4654
4655             if (n < max) { /* More greed allowed? */
4656                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4657                 cur_curlyx->u.curlyx.lastloc = locinput;
4658                 REGCP_SET(ST.lastcp);
4659                 PUSH_STATE_GOTO(WHILEM_A_max, A);
4660                 /* NOTREACHED */
4661             }
4662             goto do_whilem_B_max;
4663         }
4664         /* NOTREACHED */
4665
4666         case WHILEM_B_min: /* just matched B in a minimal match */
4667         case WHILEM_B_max: /* just matched B in a maximal match */
4668             cur_curlyx = ST.save_curlyx;
4669             sayYES;
4670             /* NOTREACHED */
4671
4672         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4673             cur_curlyx = ST.save_curlyx;
4674             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4675             cur_curlyx->u.curlyx.count--;
4676             CACHEsayNO;
4677             /* NOTREACHED */
4678
4679         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4680             REGCP_UNWIND(ST.lastcp);
4681             regcppop(rex);
4682             /* FALL THROUGH */
4683         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4684             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4685             cur_curlyx->u.curlyx.count--;
4686             CACHEsayNO;
4687             /* NOTREACHED */
4688
4689         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4690             REGCP_UNWIND(ST.lastcp);
4691             regcppop(rex);      /* Restore some previous $<digit>s? */
4692             PL_reginput = locinput;
4693             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4694                 "%*s  whilem: failed, trying continuation...\n",
4695                 REPORT_CODE_OFF+depth*2, "")
4696             );
4697           do_whilem_B_max:
4698             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4699                 && ckWARN(WARN_REGEXP)
4700                 && !(PL_reg_flags & RF_warned))
4701             {
4702                 PL_reg_flags |= RF_warned;
4703                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4704                      "Complex regular subexpression recursion",
4705                      REG_INFTY - 1);
4706             }
4707
4708             /* now try B */
4709             ST.save_curlyx = cur_curlyx;
4710             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4711             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4712             /* NOTREACHED */
4713
4714         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4715             cur_curlyx = ST.save_curlyx;
4716             REGCP_UNWIND(ST.lastcp);
4717             regcppop(rex);
4718
4719             if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4720                 /* Maximum greed exceeded */
4721                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4722                     && ckWARN(WARN_REGEXP)
4723                     && !(PL_reg_flags & RF_warned))
4724                 {
4725                     PL_reg_flags |= RF_warned;
4726                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4727                         "%s limit (%d) exceeded",
4728                         "Complex regular subexpression recursion",
4729                         REG_INFTY - 1);
4730                 }
4731                 cur_curlyx->u.curlyx.count--;
4732                 CACHEsayNO;
4733             }
4734
4735             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4736                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4737             );
4738             /* Try grabbing another A and see if it helps. */
4739             PL_reginput = locinput;
4740             cur_curlyx->u.curlyx.lastloc = locinput;
4741             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4742             REGCP_SET(ST.lastcp);
4743             PUSH_STATE_GOTO(WHILEM_A_min,
4744                 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4745             /* NOTREACHED */
4746
4747 #undef  ST
4748 #define ST st->u.branch
4749
4750         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4751             next = scan + ARG(scan);
4752             if (next == scan)
4753                 next = NULL;
4754             scan = NEXTOPER(scan);
4755             /* FALL THROUGH */
4756
4757         case BRANCH:        /*  /(...|A|...)/ */
4758             scan = NEXTOPER(scan); /* scan now points to inner node */
4759             ST.lastparen = *PL_reglastparen;
4760             ST.next_branch = next;
4761             REGCP_SET(ST.cp);
4762             PL_reginput = locinput;
4763
4764             /* Now go into the branch */
4765             if (has_cutgroup) {
4766                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4767             } else {
4768                 PUSH_STATE_GOTO(BRANCH_next, scan);
4769             }
4770             /* NOTREACHED */
4771         case CUTGROUP:
4772             PL_reginput = locinput;
4773             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4774                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4775             PUSH_STATE_GOTO(CUTGROUP_next,next);
4776             /* NOTREACHED */
4777         case CUTGROUP_next_fail:
4778             do_cutgroup = 1;
4779             no_final = 1;
4780             if (st->u.mark.mark_name)
4781                 sv_commit = st->u.mark.mark_name;
4782             sayNO;
4783             /* NOTREACHED */
4784         case BRANCH_next:
4785             sayYES;
4786             /* NOTREACHED */
4787         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4788             if (do_cutgroup) {
4789                 do_cutgroup = 0;
4790                 no_final = 0;
4791             }
4792             REGCP_UNWIND(ST.cp);
4793             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4794                 PL_regoffs[n].end = -1;
4795             *PL_reglastparen = n;
4796             /*dmq: *PL_reglastcloseparen = n; */
4797             scan = ST.next_branch;
4798             /* no more branches? */
4799             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4800                 DEBUG_EXECUTE_r({
4801                     PerlIO_printf( Perl_debug_log,
4802                         "%*s  %sBRANCH failed...%s\n",
4803                         REPORT_CODE_OFF+depth*2, "",
4804                         PL_colors[4],
4805                         PL_colors[5] );
4806                 });
4807                 sayNO_SILENT;
4808             }
4809             continue; /* execute next BRANCH[J] op */
4810             /* NOTREACHED */
4811
4812         case MINMOD:
4813             minmod = 1;
4814             break;
4815
4816 #undef  ST
4817 #define ST st->u.curlym
4818
4819         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4820
4821             /* This is an optimisation of CURLYX that enables us to push
4822              * only a single backtracking state, no matter how many matches
4823              * there are in {m,n}. It relies on the pattern being constant
4824              * length, with no parens to influence future backrefs
4825              */
4826
4827             ST.me = scan;
4828             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4829
4830             /* if paren positive, emulate an OPEN/CLOSE around A */
4831             if (ST.me->flags) {
4832                 U32 paren = ST.me->flags;
4833                 if (paren > PL_regsize)
4834                     PL_regsize = paren;
4835                 if (paren > *PL_reglastparen)
4836                     *PL_reglastparen = paren;
4837                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4838             }
4839             ST.A = scan;
4840             ST.B = next;
4841             ST.alen = 0;
4842             ST.count = 0;
4843             ST.minmod = minmod;
4844             minmod = 0;
4845             ST.c1 = CHRTEST_UNINIT;
4846             REGCP_SET(ST.cp);
4847
4848             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4849                 goto curlym_do_B;
4850
4851           curlym_do_A: /* execute the A in /A{m,n}B/  */
4852             PL_reginput = locinput;
4853             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4854             /* NOTREACHED */
4855
4856         case CURLYM_A: /* we've just matched an A */
4857             locinput = st->locinput;
4858             nextchr = UCHARAT(locinput);
4859
4860             ST.count++;
4861             /* after first match, determine A's length: u.curlym.alen */
4862             if (ST.count == 1) {
4863                 if (PL_reg_match_utf8) {
4864                     char *s = locinput;
4865                     while (s < PL_reginput) {
4866                         ST.alen++;
4867                         s += UTF8SKIP(s);
4868                     }
4869                 }
4870                 else {
4871                     ST.alen = PL_reginput - locinput;
4872                 }
4873                 if (ST.alen == 0)
4874                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4875             }
4876             DEBUG_EXECUTE_r(
4877                 PerlIO_printf(Perl_debug_log,
4878                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4879                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4880                           (IV) ST.count, (IV)ST.alen)
4881             );
4882
4883             locinput = PL_reginput;
4884
4885             if (cur_eval && cur_eval->u.eval.close_paren &&
4886                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4887                 goto fake_end;
4888
4889             {
4890                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4891                 if ( max == REG_INFTY || ST.count < max )
4892                     goto curlym_do_A; /* try to match another A */
4893             }
4894             goto curlym_do_B; /* try to match B */
4895
4896         case CURLYM_A_fail: /* just failed to match an A */
4897             REGCP_UNWIND(ST.cp);
4898
4899             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4900                 || (cur_eval && cur_eval->u.eval.close_paren &&
4901                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4902                 sayNO;
4903
4904           curlym_do_B: /* execute the B in /A{m,n}B/  */
4905             PL_reginput = locinput;
4906             if (ST.c1 == CHRTEST_UNINIT) {
4907                 /* calculate c1 and c2 for possible match of 1st char
4908                  * following curly */
4909                 ST.c1 = ST.c2 = CHRTEST_VOID;
4910                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4911                     regnode *text_node = ST.B;
4912                     if (! HAS_TEXT(text_node))
4913                         FIND_NEXT_IMPT(text_node);
4914                     /* this used to be
4915
4916                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4917
4918                         But the former is redundant in light of the latter.
4919
4920                         if this changes back then the macro for
4921                         IS_TEXT and friends need to change.
4922                      */
4923                     if (PL_regkind[OP(text_node)] == EXACT)
4924                     {
4925
4926                         ST.c1 = (U8)*STRING(text_node);
4927                         switch (OP(text_node)) {
4928                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
4929                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
4930                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
4931                             default: ST.c2 = ST.c1;
4932                         }
4933                     }
4934                 }
4935             }
4936
4937             DEBUG_EXECUTE_r(
4938                 PerlIO_printf(Perl_debug_log,
4939                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4940                     (int)(REPORT_CODE_OFF+(depth*2)),
4941                     "", (IV)ST.count)
4942                 );
4943             if (ST.c1 != CHRTEST_VOID
4944                     && UCHARAT(PL_reginput) != ST.c1
4945                     && UCHARAT(PL_reginput) != ST.c2)
4946             {
4947                 /* simulate B failing */
4948                 DEBUG_OPTIMISE_r(
4949                     PerlIO_printf(Perl_debug_log,
4950                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4951                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4952                         (IV)ST.c1,(IV)ST.c2
4953                 ));
4954                 state_num = CURLYM_B_fail;
4955                 goto reenter_switch;
4956             }
4957
4958             if (ST.me->flags) {
4959                 /* mark current A as captured */
4960                 I32 paren = ST.me->flags;
4961                 if (ST.count) {
4962                     PL_regoffs[paren].start
4963                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4964                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4965                     /*dmq: *PL_reglastcloseparen = paren; */
4966                 }
4967                 else
4968                     PL_regoffs[paren].end = -1;
4969                 if (cur_eval && cur_eval->u.eval.close_paren &&
4970                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4971                 {
4972                     if (ST.count)
4973                         goto fake_end;
4974                     else
4975                         sayNO;
4976                 }
4977             }
4978
4979             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4980             /* NOTREACHED */
4981
4982         case CURLYM_B_fail: /* just failed to match a B */
4983             REGCP_UNWIND(ST.cp);
4984             if (ST.minmod) {
4985                 I32 max = ARG2(ST.me);
4986                 if (max != REG_INFTY && ST.count == max)
4987                     sayNO;
4988                 goto curlym_do_A; /* try to match a further A */
4989             }
4990             /* backtrack one A */
4991             if (ST.count == ARG1(ST.me) /* min */)
4992                 sayNO;
4993             ST.count--;
4994             locinput = HOPc(locinput, -ST.alen);
4995             goto curlym_do_B; /* try to match B */
4996
4997 #undef ST
4998 #define ST st->u.curly
4999
5000 #define CURLY_SETPAREN(paren, success) \
5001     if (paren) { \
5002         if (success) { \
5003             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5004             PL_regoffs[paren].end = locinput - PL_bostr; \
5005             *PL_reglastcloseparen = paren; \
5006         } \
5007         else \
5008             PL_regoffs[paren].end = -1; \
5009     }
5010
5011         case STAR:              /*  /A*B/ where A is width 1 */
5012             ST.paren = 0;
5013             ST.min = 0;
5014             ST.max = REG_INFTY;
5015             scan = NEXTOPER(scan);
5016             goto repeat;
5017         case PLUS:              /*  /A+B/ where A is width 1 */
5018             ST.paren = 0;
5019             ST.min = 1;
5020             ST.max = REG_INFTY;
5021             scan = NEXTOPER(scan);
5022             goto repeat;
5023         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
5024             ST.paren = scan->flags;     /* Which paren to set */
5025             if (ST.paren > PL_regsize)
5026                 PL_regsize = ST.paren;
5027             if (ST.paren > *PL_reglastparen)
5028                 *PL_reglastparen = ST.paren;
5029             ST.min = ARG1(scan);  /* min to match */
5030             ST.max = ARG2(scan);  /* max to match */
5031             if (cur_eval && cur_eval->u.eval.close_paren &&
5032                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5033                 ST.min=1;
5034                 ST.max=1;
5035             }
5036             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5037             goto repeat;
5038         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
5039             ST.paren = 0;
5040             ST.min = ARG1(scan);  /* min to match */
5041             ST.max = ARG2(scan);  /* max to match */
5042             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5043           repeat:
5044             /*
5045             * Lookahead to avoid useless match attempts
5046             * when we know what character comes next.
5047             *
5048             * Used to only do .*x and .*?x, but now it allows
5049             * for )'s, ('s and (?{ ... })'s to be in the way
5050             * of the quantifier and the EXACT-like node.  -- japhy
5051             */
5052
5053             if (ST.min > ST.max) /* XXX make this a compile-time check? */
5054                 sayNO;
5055             if (HAS_TEXT(next) || JUMPABLE(next)) {
5056                 U8 *s;
5057                 regnode *text_node = next;
5058
5059                 if (! HAS_TEXT(text_node))
5060                     FIND_NEXT_IMPT(text_node);
5061
5062                 if (! HAS_TEXT(text_node))
5063                     ST.c1 = ST.c2 = CHRTEST_VOID;
5064                 else {
5065                     if ( PL_regkind[OP(text_node)] != EXACT ) {
5066                         ST.c1 = ST.c2 = CHRTEST_VOID;
5067                         goto assume_ok_easy;
5068                     }
5069                     else
5070                         s = (U8*)STRING(text_node);
5071
5072                     /*  Currently we only get here when
5073
5074                         PL_rekind[OP(text_node)] == EXACT
5075
5076                         if this changes back then the macro for IS_TEXT and
5077                         friends need to change. */
5078                     if (!UTF_PATTERN) {
5079                         ST.c1 = *s;
5080                         switch (OP(text_node)) {
5081                             case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5082                             case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5083                             case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5084                             default: ST.c2 = ST.c1; break;
5085                         }
5086                     }
5087                     else { /* UTF_PATTERN */
5088                         if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5089                              STRLEN ulen1, ulen2;
5090                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5091                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5092
5093                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5094                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5095 #ifdef EBCDIC
5096                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5097                                                     ckWARN(WARN_UTF8) ?
5098                                                     0 : UTF8_ALLOW_ANY);
5099                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5100                                                     ckWARN(WARN_UTF8) ?
5101                                                     0 : UTF8_ALLOW_ANY);
5102 #else
5103                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5104                                                     uniflags);
5105                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5106                                                     uniflags);
5107 #endif
5108                         }
5109                         else {
5110                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5111                                                      uniflags);
5112                         }
5113                     }
5114                 }
5115             }
5116             else
5117                 ST.c1 = ST.c2 = CHRTEST_VOID;
5118         assume_ok_easy:
5119
5120             ST.A = scan;
5121             ST.B = next;
5122             PL_reginput = locinput;
5123             if (minmod) {
5124                 minmod = 0;
5125                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5126                     sayNO;
5127                 ST.count = ST.min;
5128                 locinput = PL_reginput;
5129                 REGCP_SET(ST.cp);
5130                 if (ST.c1 == CHRTEST_VOID)
5131                     goto curly_try_B_min;
5132
5133                 ST.oldloc = locinput;
5134
5135                 /* set ST.maxpos to the furthest point along the
5136                  * string that could possibly match */
5137                 if  (ST.max == REG_INFTY) {
5138                     ST.maxpos = PL_regeol - 1;
5139                     if (utf8_target)
5140                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5141                             ST.maxpos--;
5142                 }
5143                 else if (utf8_target) {
5144                     int m = ST.max - ST.min;
5145                     for (ST.maxpos = locinput;
5146                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5147                         ST.maxpos += UTF8SKIP(ST.maxpos);
5148                 }
5149                 else {
5150                     ST.maxpos = locinput + ST.max - ST.min;
5151                     if (ST.maxpos >= PL_regeol)
5152                         ST.maxpos = PL_regeol - 1;
5153                 }
5154                 goto curly_try_B_min_known;
5155
5156             }
5157             else {
5158                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5159                 locinput = PL_reginput;
5160                 if (ST.count < ST.min)
5161                     sayNO;
5162                 if ((ST.count > ST.min)
5163                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5164                 {
5165                     /* A{m,n} must come at the end of the string, there's
5166                      * no point in backing off ... */
5167                     ST.min = ST.count;
5168                     /* ...except that $ and \Z can match before *and* after
5169                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5170                        We may back off by one in this case. */
5171                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5172                         ST.min--;
5173                 }
5174                 REGCP_SET(ST.cp);
5175                 goto curly_try_B_max;
5176             }
5177             /* NOTREACHED */
5178
5179
5180         case CURLY_B_min_known_fail:
5181             /* failed to find B in a non-greedy match where c1,c2 valid */
5182             if (ST.paren && ST.count)
5183                 PL_regoffs[ST.paren].end = -1;
5184
5185             PL_reginput = locinput;     /* Could be reset... */
5186             REGCP_UNWIND(ST.cp);
5187             /* Couldn't or didn't -- move forward. */
5188             ST.oldloc = locinput;
5189             if (utf8_target)
5190                 locinput += UTF8SKIP(locinput);
5191             else
5192                 locinput++;
5193             ST.count++;
5194           curly_try_B_min_known:
5195              /* find the next place where 'B' could work, then call B */
5196             {
5197                 int n;
5198                 if (utf8_target) {
5199                     n = (ST.oldloc == locinput) ? 0 : 1;
5200                     if (ST.c1 == ST.c2) {
5201                         STRLEN len;
5202                         /* set n to utf8_distance(oldloc, locinput) */
5203                         while (locinput <= ST.maxpos &&
5204                                utf8n_to_uvchr((U8*)locinput,
5205                                               UTF8_MAXBYTES, &len,
5206                                               uniflags) != (UV)ST.c1) {
5207                             locinput += len;
5208                             n++;
5209                         }
5210                     }
5211                     else {
5212                         /* set n to utf8_distance(oldloc, locinput) */
5213                         while (locinput <= ST.maxpos) {
5214                             STRLEN len;
5215                             const UV c = utf8n_to_uvchr((U8*)locinput,
5216                                                   UTF8_MAXBYTES, &len,
5217                                                   uniflags);
5218                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
5219                                 break;
5220                             locinput += len;
5221                             n++;
5222                         }
5223                     }
5224                 }
5225                 else {
5226                     if (ST.c1 == ST.c2) {
5227                         while (locinput <= ST.maxpos &&
5228                                UCHARAT(locinput) != ST.c1)
5229                             locinput++;
5230                     }
5231                     else {
5232                         while (locinput <= ST.maxpos
5233                                && UCHARAT(locinput) != ST.c1
5234                                && UCHARAT(locinput) != ST.c2)
5235                             locinput++;
5236                     }
5237                     n = locinput - ST.oldloc;
5238                 }
5239                 if (locinput > ST.maxpos)
5240                     sayNO;
5241                 /* PL_reginput == oldloc now */
5242                 if (n) {
5243                     ST.count += n;
5244                     if (regrepeat(rex, ST.A, n, depth) < n)
5245                         sayNO;
5246                 }
5247                 PL_reginput = locinput;
5248                 CURLY_SETPAREN(ST.paren, ST.count);
5249                 if (cur_eval && cur_eval->u.eval.close_paren &&
5250                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5251                     goto fake_end;
5252                 }
5253                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5254             }
5255             /* NOTREACHED */
5256
5257
5258         case CURLY_B_min_fail:
5259             /* failed to find B in a non-greedy match where c1,c2 invalid */
5260             if (ST.paren && ST.count)
5261                 PL_regoffs[ST.paren].end = -1;
5262
5263             REGCP_UNWIND(ST.cp);
5264             /* failed -- move forward one */
5265             PL_reginput = locinput;
5266             if (regrepeat(rex, ST.A, 1, depth)) {
5267                 ST.count++;
5268                 locinput = PL_reginput;
5269                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5270                         ST.count > 0)) /* count overflow ? */
5271                 {
5272                   curly_try_B_min:
5273                     CURLY_SETPAREN(ST.paren, ST.count);
5274                     if (cur_eval && cur_eval->u.eval.close_paren &&
5275                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5276                         goto fake_end;
5277                     }
5278                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5279                 }
5280             }
5281             sayNO;
5282             /* NOTREACHED */
5283
5284
5285         curly_try_B_max:
5286             /* a successful greedy match: now try to match B */
5287             if (cur_eval && cur_eval->u.eval.close_paren &&
5288                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5289                 goto fake_end;
5290             }
5291             {
5292                 UV c = 0;
5293                 if (ST.c1 != CHRTEST_VOID)
5294                     c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5295                                            UTF8_MAXBYTES, 0, uniflags)
5296                                 : (UV) UCHARAT(PL_reginput);
5297                 /* If it could work, try it. */
5298                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5299                     CURLY_SETPAREN(ST.paren, ST.count);
5300                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5301                     /* NOTREACHED */
5302                 }
5303             }
5304             /* FALL THROUGH */
5305         case CURLY_B_max_fail:
5306             /* failed to find B in a greedy match */
5307             if (ST.paren && ST.count)
5308                 PL_regoffs[ST.paren].end = -1;
5309
5310             REGCP_UNWIND(ST.cp);
5311             /*  back up. */
5312             if (--ST.count < ST.min)
5313                 sayNO;
5314             PL_reginput = locinput = HOPc(locinput, -1);
5315             goto curly_try_B_max;
5316
5317 #undef ST
5318
5319         case END:
5320             fake_end:
5321             if (cur_eval) {
5322                 /* we've just finished A in /(??{A})B/; now continue with B */
5323                 I32 tmpix;
5324                 st->u.eval.toggle_reg_flags
5325                             = cur_eval->u.eval.toggle_reg_flags;
5326                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5327
5328                 st->u.eval.prev_rex = rex_sv;           /* inner */
5329                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5330                 rex = (struct regexp *)SvANY(rex_sv);
5331                 rexi = RXi_GET(rex);
5332                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5333                 ReREFCNT_inc(rex_sv);
5334                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5335
5336                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5337                 PL_reglastparen = &rex->lastparen;
5338                 PL_reglastcloseparen = &rex->lastcloseparen;
5339
5340                 REGCP_SET(st->u.eval.lastcp);
5341                 PL_reginput = locinput;
5342
5343                 /* Restore parens of the outer rex without popping the
5344                  * savestack */
5345                 tmpix = PL_savestack_ix;
5346                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5347                 regcppop(rex);
5348                 PL_savestack_ix = tmpix;
5349
5350                 st->u.eval.prev_eval = cur_eval;
5351                 cur_eval = cur_eval->u.eval.prev_eval;
5352                 DEBUG_EXECUTE_r(
5353                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5354                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5355                 if ( nochange_depth )
5356                     nochange_depth--;
5357
5358                 PUSH_YES_STATE_GOTO(EVAL_AB,
5359                         st->u.eval.prev_eval->u.eval.B); /* match B */
5360             }
5361
5362             if (locinput < reginfo->till) {
5363                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5364                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5365                                       PL_colors[4],
5366                                       (long)(locinput - PL_reg_starttry),
5367                                       (long)(reginfo->till - PL_reg_starttry),
5368                                       PL_colors[5]));
5369
5370                 sayNO_SILENT;           /* Cannot match: too short. */
5371             }
5372             PL_reginput = locinput;     /* put where regtry can find it */
5373             sayYES;                     /* Success! */
5374
5375         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5376             DEBUG_EXECUTE_r(
5377             PerlIO_printf(Perl_debug_log,
5378                 "%*s  %ssubpattern success...%s\n",
5379                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5380             PL_reginput = locinput;     /* put where regtry can find it */
5381             sayYES;                     /* Success! */
5382
5383 #undef  ST
5384 #define ST st->u.ifmatch
5385
5386         case SUSPEND:   /* (?>A) */
5387             ST.wanted = 1;
5388             PL_reginput = locinput;
5389             goto do_ifmatch;
5390
5391         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5392             ST.wanted = 0;
5393             goto ifmatch_trivial_fail_test;
5394
5395         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5396             ST.wanted = 1;
5397           ifmatch_trivial_fail_test:
5398             if (scan->flags) {
5399                 char * const s = HOPBACKc(locinput, scan->flags);
5400                 if (!s) {
5401                     /* trivial fail */
5402                     if (logical) {
5403                         logical = 0;
5404                         sw = 1 - cBOOL(ST.wanted);
5405                     }
5406                     else if (ST.wanted)
5407                         sayNO;
5408                     next = scan + ARG(scan);
5409                     if (next == scan)
5410                         next = NULL;
5411                     break;
5412                 }
5413                 PL_reginput = s;
5414             }
5415             else
5416                 PL_reginput = locinput;
5417
5418           do_ifmatch:
5419             ST.me = scan;
5420             ST.logical = logical;
5421             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5422
5423             /* execute body of (?...A) */
5424             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5425             /* NOTREACHED */
5426
5427         case IFMATCH_A_fail: /* body of (?...A) failed */
5428             ST.wanted = !ST.wanted;
5429             /* FALL THROUGH */
5430
5431         case IFMATCH_A: /* body of (?...A) succeeded */
5432             if (ST.logical) {
5433                 sw = cBOOL(ST.wanted);
5434             }
5435             else if (!ST.wanted)
5436                 sayNO;
5437
5438             if (OP(ST.me) == SUSPEND)
5439                 locinput = PL_reginput;
5440             else {
5441                 locinput = PL_reginput = st->locinput;
5442                 nextchr = UCHARAT(locinput);
5443             }
5444             scan = ST.me + ARG(ST.me);
5445             if (scan == ST.me)
5446                 scan = NULL;
5447             continue; /* execute B */
5448
5449 #undef ST
5450
5451         case LONGJMP:
5452             next = scan + ARG(scan);
5453             if (next == scan)
5454                 next = NULL;
5455             break;
5456         case COMMIT:
5457             reginfo->cutpoint = PL_regeol;
5458             /* FALLTHROUGH */
5459         case PRUNE:
5460             PL_reginput = locinput;
5461             if (!scan->flags)
5462                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5463             PUSH_STATE_GOTO(COMMIT_next,next);
5464             /* NOTREACHED */
5465         case COMMIT_next_fail:
5466             no_final = 1;
5467             /* FALLTHROUGH */
5468         case OPFAIL:
5469             sayNO;
5470             /* NOTREACHED */
5471
5472 #define ST st->u.mark
5473         case MARKPOINT:
5474             ST.prev_mark = mark_state;
5475             ST.mark_name = sv_commit = sv_yes_mark
5476                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5477             mark_state = st;
5478             ST.mark_loc = PL_reginput = locinput;
5479             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5480             /* NOTREACHED */
5481         case MARKPOINT_next:
5482             mark_state = ST.prev_mark;
5483             sayYES;
5484             /* NOTREACHED */
5485         case MARKPOINT_next_fail:
5486             if (popmark && sv_eq(ST.mark_name,popmark))
5487             {
5488                 if (ST.mark_loc > startpoint)
5489                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5490                 popmark = NULL; /* we found our mark */
5491                 sv_commit = ST.mark_name;
5492
5493                 DEBUG_EXECUTE_r({
5494                         PerlIO_printf(Perl_debug_log,
5495                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5496                             REPORT_CODE_OFF+depth*2, "",
5497                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5498                 });
5499             }
5500             mark_state = ST.prev_mark;
5501             sv_yes_mark = mark_state ?
5502                 mark_state->u.mark.mark_name : NULL;
5503             sayNO;
5504             /* NOTREACHED */
5505         case SKIP:
5506             PL_reginput = locinput;
5507             if (scan->flags) {
5508                 /* (*SKIP) : if we fail we cut here*/
5509                 ST.mark_name = NULL;
5510                 ST.mark_loc = locinput;
5511                 PUSH_STATE_GOTO(SKIP_next,next);
5512             } else {
5513                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5514                    otherwise do nothing.  Meaning we need to scan
5515                  */
5516                 regmatch_state *cur = mark_state;
5517                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5518
5519                 while (cur) {
5520                     if ( sv_eq( cur->u.mark.mark_name,
5521                                 find ) )
5522                     {
5523                         ST.mark_name = find;
5524                         PUSH_STATE_GOTO( SKIP_next, next );
5525                     }
5526                     cur = cur->u.mark.prev_mark;
5527                 }
5528             }
5529             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5530             break;
5531         case SKIP_next_fail:
5532             if (ST.mark_name) {
5533                 /* (*CUT:NAME) - Set up to search for the name as we
5534                    collapse the stack*/
5535                 popmark = ST.mark_name;
5536             } else {
5537                 /* (*CUT) - No name, we cut here.*/
5538                 if (ST.mark_loc > startpoint)
5539                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5540                 /* but we set sv_commit to latest mark_name if there
5541                    is one so they can test to see how things lead to this
5542                    cut */
5543                 if (mark_state)
5544                     sv_commit=mark_state->u.mark.mark_name;
5545             }
5546             no_final = 1;
5547             sayNO;
5548             /* NOTREACHED */
5549 #undef ST
5550         case FOLDCHAR:
5551             n = ARG(scan);
5552             if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5553                 locinput += ln;
5554             } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5555                 sayNO;
5556             } else  {
5557                 U8 folded[UTF8_MAXBYTES_CASE+1];
5558                 STRLEN foldlen;
5559                 const char * const l = locinput;
5560                 char *e = PL_regeol;
5561                 to_uni_fold(n, folded, &foldlen);
5562
5563                 if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5564                                l, &e, 0,  utf8_target)) {
5565                         sayNO;
5566                 }
5567                 locinput = e;
5568             }
5569             nextchr = UCHARAT(locinput);
5570             break;
5571         case LNBREAK:
5572             if ((n=is_LNBREAK(locinput,utf8_target))) {
5573                 locinput += n;
5574                 nextchr = UCHARAT(locinput);
5575             } else
5576                 sayNO;
5577             break;
5578
5579 #define CASE_CLASS(nAmE)                              \
5580         case nAmE:                                    \
5581             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5582                 locinput += n;                        \
5583                 nextchr = UCHARAT(locinput);          \
5584             } else                                    \
5585                 sayNO;                                \
5586             break;                                    \
5587         case N##nAmE:                                 \
5588             if ((n=is_##nAmE(locinput,utf8_target))) {    \
5589                 sayNO;                                \
5590             } else {                                  \
5591                 locinput += UTF8SKIP(locinput);       \
5592                 nextchr = UCHARAT(locinput);          \
5593             }                                         \
5594             break
5595
5596         CASE_CLASS(VERTWS);
5597         CASE_CLASS(HORIZWS);
5598 #undef CASE_CLASS
5599
5600         default:
5601             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5602                           PTR2UV(scan), OP(scan));
5603             Perl_croak(aTHX_ "regexp memory corruption");
5604
5605         } /* end switch */
5606
5607         /* switch break jumps here */
5608         scan = next; /* prepare to execute the next op and ... */
5609         continue;    /* ... jump back to the top, reusing st */
5610         /* NOTREACHED */
5611
5612       push_yes_state:
5613         /* push a state that backtracks on success */
5614         st->u.yes.prev_yes_state = yes_state;
5615         yes_state = st;
5616         /* FALL THROUGH */
5617       push_state:
5618         /* push a new regex state, then continue at scan  */
5619         {
5620             regmatch_state *newst;
5621
5622             DEBUG_STACK_r({
5623                 regmatch_state *cur = st;
5624                 regmatch_state *curyes = yes_state;
5625                 int curd = depth;
5626                 regmatch_slab *slab = PL_regmatch_slab;
5627                 for (;curd > -1;cur--,curd--) {
5628                     if (cur < SLAB_FIRST(slab)) {
5629                         slab = slab->prev;
5630                         cur = SLAB_LAST(slab);
5631                     }
5632                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5633                         REPORT_CODE_OFF + 2 + depth * 2,"",
5634                         curd, PL_reg_name[cur->resume_state],
5635                         (curyes == cur) ? "yes" : ""
5636                     );
5637                     if (curyes == cur)
5638                         curyes = cur->u.yes.prev_yes_state;
5639                 }
5640             } else
5641                 DEBUG_STATE_pp("push")
5642             );
5643             depth++;
5644             st->locinput = locinput;
5645             newst = st+1;
5646             if (newst >  SLAB_LAST(PL_regmatch_slab))
5647                 newst = S_push_slab(aTHX);
5648             PL_regmatch_state = newst;
5649
5650             locinput = PL_reginput;
5651             nextchr = UCHARAT(locinput);
5652             st = newst;
5653             continue;
5654             /* NOTREACHED */
5655         }
5656     }
5657
5658     /*
5659     * We get here only if there's trouble -- normally "case END" is
5660     * the terminating point.
5661     */
5662     Perl_croak(aTHX_ "corrupted regexp pointers");
5663     /*NOTREACHED*/
5664     sayNO;
5665
5666 yes:
5667     if (yes_state) {
5668         /* we have successfully completed a subexpression, but we must now
5669          * pop to the state marked by yes_state and continue from there */
5670         assert(st != yes_state);
5671 #ifdef DEBUGGING
5672         while (st != yes_state) {
5673             st--;
5674             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5675                 PL_regmatch_slab = PL_regmatch_slab->prev;
5676                 st = SLAB_LAST(PL_regmatch_slab);
5677             }
5678             DEBUG_STATE_r({
5679                 if (no_final) {
5680                     DEBUG_STATE_pp("pop (no final)");
5681                 } else {
5682                     DEBUG_STATE_pp("pop (yes)");
5683                 }
5684             });
5685             depth--;
5686         }
5687 #else
5688         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5689             || yes_state > SLAB_LAST(PL_regmatch_slab))
5690         {
5691             /* not in this slab, pop slab */
5692             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5693             PL_regmatch_slab = PL_regmatch_slab->prev;
5694             st = SLAB_LAST(PL_regmatch_slab);
5695         }
5696         depth -= (st - yes_state);
5697 #endif
5698         st = yes_state;
5699         yes_state = st->u.yes.prev_yes_state;
5700         PL_regmatch_state = st;
5701
5702         if (no_final) {
5703             locinput= st->locinput;
5704             nextchr = UCHARAT(locinput);
5705         }
5706         state_num = st->resume_state + no_final;
5707         goto reenter_switch;
5708     }
5709
5710     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5711                           PL_colors[4], PL_colors[5]));
5712
5713     if (PL_reg_eval_set) {
5714         /* each successfully executed (?{...}) block does the equivalent of
5715          *   local $^R = do {...}
5716          * When popping the save stack, all these locals would be undone;
5717          * bypass this by setting the outermost saved $^R to the latest
5718          * value */
5719         if (oreplsv != GvSV(PL_replgv))
5720             sv_setsv(oreplsv, GvSV(PL_replgv));
5721     }
5722     result = 1;
5723     goto final_exit;
5724
5725 no:
5726     DEBUG_EXECUTE_r(
5727         PerlIO_printf(Perl_debug_log,
5728             "%*s  %sfailed...%s\n",
5729             REPORT_CODE_OFF+depth*2, "",
5730             PL_colors[4], PL_colors[5])
5731         );
5732
5733 no_silent:
5734     if (no_final) {
5735         if (yes_state) {
5736             goto yes;
5737         } else {
5738             goto final_exit;
5739         }
5740     }
5741     if (depth) {
5742         /* there's a previous state to backtrack to */
5743         st--;
5744         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5745             PL_regmatch_slab = PL_regmatch_slab->prev;
5746             st = SLAB_LAST(PL_regmatch_slab);
5747         }
5748         PL_regmatch_state = st;
5749         locinput= st->locinput;
5750         nextchr = UCHARAT(locinput);
5751
5752         DEBUG_STATE_pp("pop");
5753         depth--;
5754         if (yes_state == st)
5755             yes_state = st->u.yes.prev_yes_state;
5756
5757         state_num = st->resume_state + 1; /* failure = success + 1 */
5758         goto reenter_switch;
5759     }
5760     result = 0;
5761
5762   final_exit:
5763     if (rex->intflags & PREGf_VERBARG_SEEN) {
5764         SV *sv_err = get_sv("REGERROR", 1);
5765         SV *sv_mrk = get_sv("REGMARK", 1);
5766         if (result) {
5767             sv_commit = &PL_sv_no;
5768             if (!sv_yes_mark)
5769                 sv_yes_mark = &PL_sv_yes;
5770         } else {
5771             if (!sv_commit)
5772                 sv_commit = &PL_sv_yes;
5773             sv_yes_mark = &PL_sv_no;
5774         }
5775         sv_setsv(sv_err, sv_commit);
5776         sv_setsv(sv_mrk, sv_yes_mark);
5777     }
5778
5779     /* clean up; in particular, free all slabs above current one */
5780     LEAVE_SCOPE(oldsave);
5781
5782     return result;
5783 }
5784
5785 /*
5786  - regrepeat - repeatedly match something simple, report how many
5787  */
5788 /*
5789  * [This routine now assumes that it will only match on things of length 1.
5790  * That was true before, but now we assume scan - reginput is the count,
5791  * rather than incrementing count on every character.  [Er, except utf8.]]
5792  */
5793 STATIC I32
5794 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5795 {
5796     dVAR;
5797     register char *scan;
5798     register I32 c;
5799     register char *loceol = PL_regeol;
5800     register I32 hardcount = 0;
5801     register bool utf8_target = PL_reg_match_utf8;
5802 #ifndef DEBUGGING
5803     PERL_UNUSED_ARG(depth);
5804 #endif
5805
5806     PERL_ARGS_ASSERT_REGREPEAT;
5807
5808     scan = PL_reginput;
5809     if (max == REG_INFTY)
5810         max = I32_MAX;
5811     else if (max < loceol - scan)
5812         loceol = scan + max;
5813     switch (OP(p)) {
5814     case REG_ANY:
5815         if (utf8_target) {
5816             loceol = PL_regeol;
5817             while (scan < loceol && hardcount < max && *scan != '\n') {
5818                 scan += UTF8SKIP(scan);
5819                 hardcount++;
5820             }
5821         } else {
5822             while (scan < loceol && *scan != '\n')
5823                 scan++;
5824         }
5825         break;
5826     case SANY:
5827         if (utf8_target) {
5828             loceol = PL_regeol;
5829             while (scan < loceol && hardcount < max) {
5830                 scan += UTF8SKIP(scan);
5831                 hardcount++;
5832             }
5833         }
5834         else
5835             scan = loceol;
5836         break;
5837     case CANY:
5838         scan = loceol;
5839         break;
5840     case EXACT:
5841         /* To get here, EXACTish nodes must have *byte* length == 1.  That
5842          * means they match only characters in the string that can be expressed
5843          * as a single byte.  For non-utf8 strings, that means a simple match.
5844          * For utf8 strings, the character matched must be an invariant, or
5845          * downgradable to a single byte.  The pattern's utf8ness is
5846          * irrelevant, as since it's a single byte, it either isn't utf8, or if
5847          * it is, it's an invariant */
5848
5849         c = (U8)*STRING(p);
5850         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5851
5852         if (! utf8_target || UNI_IS_INVARIANT(c)) {
5853             while (scan < loceol && UCHARAT(scan) == c) {
5854                 scan++;
5855             }
5856         }
5857         else {
5858
5859             /* Here, the string is utf8, and the pattern char is different
5860              * in utf8 than not, so can't compare them directly.  Outside the
5861              * loop, find find the two utf8 bytes that represent c, and then
5862              * look for those in sequence in the utf8 string */
5863             U8 high = UTF8_TWO_BYTE_HI(c);
5864             U8 low = UTF8_TWO_BYTE_LO(c);
5865             loceol = PL_regeol;
5866
5867             while (hardcount < max
5868                     && scan + 1 < loceol
5869                     && UCHARAT(scan) == high
5870                     && UCHARAT(scan + 1) == low)
5871             {
5872                 scan += 2;
5873                 hardcount++;
5874             }
5875         }
5876         break;
5877     case EXACTFL:
5878         PL_reg_flags |= RF_tainted;
5879         /* FALL THROUGH */
5880     case EXACTF:
5881     case EXACTFU:
5882
5883         /* The comments for the EXACT case above apply as well to these fold
5884          * ones */
5885
5886         c = (U8)*STRING(p);
5887         assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5888
5889         if (utf8_target) { /* Use full Unicode fold matching */
5890
5891             /* For the EXACTFL case, It doesn't really make sense to compare
5892              * locale and utf8, but it is best we can do.  The documents warn
5893              * against mixing them */
5894
5895             char *tmpeol = loceol;
5896             while (hardcount < max
5897                     && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
5898                                    STRING(p), NULL, 1, cBOOL(UTF_PATTERN)))
5899             {
5900                 scan = tmpeol;
5901                 tmpeol = loceol;
5902                 hardcount++;
5903             }
5904
5905             /* XXX Note that the above handles properly the German sharp s in
5906              * the pattern matching ss in the string.  But it doesn't handle
5907              * properly cases where the string contains say 'LIGATURE ff' and
5908              * the pattern is 'f+'.  This would require, say, a new function or
5909              * revised interface to foldEQ_utf8(), in which the maximum number
5910              * of characters to match could be passed and it would return how
5911              * many actually did.  This is just one of many cases where
5912              * multi-char folds don't work properly, and so the fix is being
5913              * deferred */
5914         }
5915         else {
5916             U8 folded;
5917
5918             /* Here, the string isn't utf8 and c is a single byte; and either
5919              * the pattern isn't utf8 or c is an invariant, so its utf8ness
5920              * doesn't affect c.  Can just do simple comparisons for exact or
5921              * fold matching. */
5922             switch (OP(p)) {
5923                 case EXACTF: folded = PL_fold[c]; break;
5924                 case EXACTFU: folded = PL_fold_latin1[c]; break;
5925                 case EXACTFL: folded = PL_fold_locale[c]; break;
5926                 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
5927             }
5928             while (scan < loceol &&
5929                    (UCHARAT(scan) == c || UCHARAT(scan) == folded))
5930             {
5931                 scan++;
5932             }
5933         }
5934         break;
5935     case ANYOF:
5936         if (utf8_target) {
5937             loceol = PL_regeol;
5938             while (hardcount < max && scan < loceol &&
5939                    reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
5940                 scan += UTF8SKIP(scan);
5941                 hardcount++;
5942             }
5943         } else {
5944             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5945                 scan++;
5946         }
5947         break;
5948     case ALNUM:
5949         if (utf8_target) {
5950             loceol = PL_regeol;
5951             LOAD_UTF8_CHARCLASS_ALNUM();
5952             while (hardcount < max && scan < loceol &&
5953                    swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5954             {
5955                 scan += UTF8SKIP(scan);
5956                 hardcount++;
5957             }
5958         } else if (FLAGS(p) & USE_UNI) {
5959             while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
5960                 scan++;
5961             }
5962         } else {
5963             while (scan < loceol && isALNUM((U8) *scan)) {
5964                 scan++;
5965             }
5966         }
5967         break;
5968     case ALNUML:
5969         PL_reg_flags |= RF_tainted;
5970         if (utf8_target) {
5971             loceol = PL_regeol;
5972             while (hardcount < max && scan < loceol &&
5973                    isALNUM_LC_utf8((U8*)scan)) {
5974                 scan += UTF8SKIP(scan);
5975                 hardcount++;
5976             }
5977         } else {
5978             while (scan < loceol && isALNUM_LC(*scan))
5979                 scan++;
5980         }
5981         break;
5982     case NALNUM:
5983         if (utf8_target) {
5984             loceol = PL_regeol;
5985             LOAD_UTF8_CHARCLASS_ALNUM();
5986             while (hardcount < max && scan < loceol &&
5987                    !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5988             {
5989                 scan += UTF8SKIP(scan);
5990                 hardcount++;
5991             }
5992         } else if (FLAGS(p) & USE_UNI) {
5993             while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
5994                 scan++;
5995             }
5996         } else {
5997             while (scan < loceol && ! isALNUM((U8) *scan)) {
5998                 scan++;
5999             }
6000         }
6001         break;
6002     case NALNUML:
6003         PL_reg_flags |= RF_tainted;
6004         if (utf8_target) {
6005             loceol = PL_regeol;
6006             while (hardcount < max && scan < loceol &&
6007                    !isALNUM_LC_utf8((U8*)scan)) {
6008                 scan += UTF8SKIP(scan);
6009                 hardcount++;
6010             }
6011         } else {
6012             while (scan < loceol && !isALNUM_LC(*scan))
6013                 scan++;
6014         }
6015         break;
6016     case SPACE:
6017         if (utf8_target) {
6018             loceol = PL_regeol;
6019             LOAD_UTF8_CHARCLASS_SPACE();
6020             while (hardcount < max && scan < loceol &&
6021                    (*scan == ' ' ||
6022                     swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6023             {
6024                 scan += UTF8SKIP(scan);
6025                 hardcount++;
6026             }
6027         } else if (FLAGS(p) & USE_UNI) {
6028             while (scan < loceol && isSPACE_L1((U8) *scan)) {
6029                 scan++;
6030             }
6031         } else {
6032             while (scan < loceol && isSPACE((U8) *scan))
6033                 scan++;
6034         }
6035         break;
6036     case SPACEL:
6037         PL_reg_flags |= RF_tainted;
6038         if (utf8_target) {
6039             loceol = PL_regeol;
6040             while (hardcount < max && scan < loceol &&
6041                    isSPACE_LC_utf8((U8*)scan)) {
6042                 scan += UTF8SKIP(scan);
6043                 hardcount++;
6044             }
6045         } else {
6046             while (scan < loceol && isSPACE_LC(*scan))
6047                 scan++;
6048         }
6049         break;
6050     case NSPACE:
6051         if (utf8_target) {
6052             loceol = PL_regeol;
6053             LOAD_UTF8_CHARCLASS_SPACE();
6054             while (hardcount < max && scan < loceol &&
6055                    !(*scan == ' ' ||
6056                      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6057             {
6058                 scan += UTF8SKIP(scan);
6059                 hardcount++;
6060             }
6061         } else if (FLAGS(p) & USE_UNI) {
6062             while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6063                 scan++;
6064             }
6065         } else {
6066             while (scan < loceol && ! isSPACE((U8) *scan)) {
6067                 scan++;
6068             }
6069         }
6070         break;
6071     case NSPACEL:
6072         PL_reg_flags |= RF_tainted;
6073         if (utf8_target) {
6074             loceol = PL_regeol;
6075             while (hardcount < max && scan < loceol &&
6076                    !isSPACE_LC_utf8((U8*)scan)) {
6077                 scan += UTF8SKIP(scan);
6078                 hardcount++;
6079             }
6080         } else {
6081             while (scan < loceol && !isSPACE_LC(*scan))
6082                 scan++;
6083         }
6084         break;
6085     case DIGIT:
6086         if (utf8_target) {
6087             loceol = PL_regeol;
6088             LOAD_UTF8_CHARCLASS_DIGIT();
6089             while (hardcount < max && scan < loceol &&
6090                    swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6091                 scan += UTF8SKIP(scan);
6092                 hardcount++;
6093             }
6094         } else {
6095             while (scan < loceol && isDIGIT(*scan))
6096                 scan++;
6097         }
6098         break;
6099     case DIGITL:
6100         PL_reg_flags |= RF_tainted;
6101         if (utf8_target) {
6102             loceol = PL_regeol;
6103             while (hardcount < max && scan < loceol &&
6104                    isDIGIT_LC_utf8((U8*)scan)) {
6105                 scan += UTF8SKIP(scan);
6106                 hardcount++;
6107             }
6108         } else {
6109             while (scan < loceol && isDIGIT_LC(*scan))
6110                 scan++;
6111         }
6112         break;
6113     case NDIGIT:
6114         if (utf8_target) {
6115             loceol = PL_regeol;
6116             LOAD_UTF8_CHARCLASS_DIGIT();
6117             while (hardcount < max && scan < loceol &&
6118                    !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6119                 scan += UTF8SKIP(scan);
6120                 hardcount++;
6121             }
6122         } else {
6123             while (scan < loceol && !isDIGIT(*scan))
6124                 scan++;
6125         }
6126     case NDIGITL:
6127         PL_reg_flags |= RF_tainted;
6128         if (utf8_target) {
6129             loceol = PL_regeol;
6130             while (hardcount < max && scan < loceol &&
6131                    !isDIGIT_LC_utf8((U8*)scan)) {
6132                 scan += UTF8SKIP(scan);
6133                 hardcount++;
6134             }
6135         } else {
6136             while (scan < loceol && !isDIGIT_LC(*scan))
6137                 scan++;
6138         }
6139         break;
6140     case LNBREAK:
6141         if (utf8_target) {
6142             loceol = PL_regeol;
6143             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6144                 scan += c;
6145                 hardcount++;
6146             }
6147         } else {
6148             /*
6149               LNBREAK can match two latin chars, which is ok,
6150               because we have a null terminated string, but we
6151               have to use hardcount in this situation
6152             */
6153             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6154                 scan+=c;
6155                 hardcount++;
6156             }
6157         }
6158         break;
6159     case HORIZWS:
6160         if (utf8_target) {
6161             loceol = PL_regeol;
6162             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6163                 scan += c;
6164                 hardcount++;
6165             }
6166         } else {
6167             while (scan < loceol && is_HORIZWS_latin1(scan))
6168                 scan++;
6169         }
6170         break;
6171     case NHORIZWS:
6172         if (utf8_target) {
6173             loceol = PL_regeol;
6174             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6175                 scan += UTF8SKIP(scan);
6176                 hardcount++;
6177             }
6178         } else {
6179             while (scan < loceol && !is_HORIZWS_latin1(scan))
6180                 scan++;
6181
6182         }
6183         break;
6184     case VERTWS:
6185         if (utf8_target) {
6186             loceol = PL_regeol;
6187             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6188                 scan += c;
6189                 hardcount++;
6190             }
6191         } else {
6192             while (scan < loceol && is_VERTWS_latin1(scan))
6193                 scan++;
6194
6195         }
6196         break;
6197     case NVERTWS:
6198         if (utf8_target) {
6199             loceol = PL_regeol;
6200             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6201                 scan += UTF8SKIP(scan);
6202                 hardcount++;
6203             }
6204         } else {
6205             while (scan < loceol && !is_VERTWS_latin1(scan))
6206                 scan++;
6207
6208         }
6209         break;
6210
6211     default:            /* Called on something of 0 width. */
6212         break;          /* So match right here or not at all. */
6213     }
6214
6215     if (hardcount)
6216         c = hardcount;
6217     else
6218         c = scan - PL_reginput;
6219     PL_reginput = scan;
6220
6221     DEBUG_r({
6222         GET_RE_DEBUG_FLAGS_DECL;
6223         DEBUG_EXECUTE_r({
6224             SV * const prop = sv_newmortal();
6225             regprop(prog, prop, p);
6226             PerlIO_printf(Perl_debug_log,
6227                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6228                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6229         });
6230     });
6231
6232     return(c);
6233 }
6234
6235
6236 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6237 /*
6238 - regclass_swash - prepare the utf8 swash
6239 */
6240
6241 SV *
6242 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6243 {
6244     dVAR;
6245     SV *sw  = NULL;
6246     SV *si  = NULL;
6247     SV *alt = NULL;
6248     RXi_GET_DECL(prog,progi);
6249     const struct reg_data * const data = prog ? progi->data : NULL;
6250
6251     PERL_ARGS_ASSERT_REGCLASS_SWASH;
6252
6253     if (data && data->count) {
6254         const U32 n = ARG(node);
6255
6256         if (data->what[n] == 's') {
6257             SV * const rv = MUTABLE_SV(data->data[n]);
6258             AV * const av = MUTABLE_AV(SvRV(rv));
6259             SV **const ary = AvARRAY(av);
6260             SV **a, **b;
6261
6262             /* See the end of regcomp.c:S_regclass() for
6263              * documentation of these array elements. */
6264
6265             si = *ary;
6266             a  = SvROK(ary[1]) ? &ary[1] : NULL;
6267             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6268
6269             if (a)
6270                 sw = *a;
6271             else if (si && doinit) {
6272                 sw = swash_init("utf8", "", si, 1, 0);
6273                 (void)av_store(av, 1, sw);
6274             }
6275             if (b)
6276                 alt = *b;
6277         }
6278     }
6279
6280     if (listsvp)
6281         *listsvp = si;
6282     if (altsvp)
6283         *altsvp  = alt;
6284
6285     return sw;
6286 }
6287 #endif
6288
6289 /*
6290  - reginclass - determine if a character falls into a character class
6291
6292   n is the ANYOF regnode
6293   p is the target string
6294   lenp is pointer to the maximum number of bytes of how far to go in p
6295     (This is assumed wthout checking to always be at least the current
6296     character's size)
6297   utf8_target tells whether p is in UTF-8.
6298
6299   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6300   from a successful match, the value it points to will be updated to how many
6301   bytes in p were matched.  If there was no match, the value is undefined,
6302   possibly changed from the input.
6303
6304  */
6305
6306 STATIC bool
6307 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6308 {
6309     dVAR;
6310     const char flags = ANYOF_FLAGS(n);
6311     bool match = FALSE;
6312     UV c = *p;
6313     STRLEN c_len = 0;
6314     STRLEN maxlen;
6315
6316     PERL_ARGS_ASSERT_REGINCLASS;
6317
6318     /* If c is not already the code point, get it */
6319     if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6320         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6321                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6322                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6323                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6324                  * UTF8_ALLOW_FFFF */
6325         if (c_len == (STRLEN)-1)
6326             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6327     }
6328     else {
6329         c_len = 1;
6330     }
6331
6332     /* Use passed in max length, or one character if none passed in or less
6333      * than one character.  And assume will match just one character.  This is
6334      * overwritten later if matched more. */
6335     if (lenp) {
6336         maxlen = (*lenp > c_len) ? *lenp : c_len;
6337         *lenp = c_len;
6338
6339     }
6340     else {
6341         maxlen = c_len;
6342     }
6343
6344     /* If this character is potentially in the bitmap, check it */
6345     if (c < 256) {
6346         if (ANYOF_BITMAP_TEST(n, c))
6347             match = TRUE;
6348         else if (flags & ANYOF_FOLD) {
6349             U8 f;
6350
6351             if (flags & ANYOF_LOCALE) {
6352                 PL_reg_flags |= RF_tainted;
6353                 f = PL_fold_locale[c];
6354             }
6355             else
6356                 f = PL_fold[c];
6357             if (f != c && ANYOF_BITMAP_TEST(n, f))
6358                 match = TRUE;
6359         }
6360
6361         if (!match && ANYOF_CLASS_TEST_ANY_SET(n)) {
6362             PL_reg_flags |= RF_tainted;     /* CLASS implies LOCALE */
6363             if (
6364                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6365                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6366                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6367                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6368                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6369                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6370                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6371                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6372                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6373                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6374                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6375                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6376                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6377                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6378                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6379                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6380                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6381                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6382                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6383                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6384                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6385                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6386                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6387                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6388                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6389                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6390                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6391                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6392                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6393                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6394                 ) /* How's that for a conditional? */
6395             {
6396                 match = TRUE;
6397             }
6398         }
6399     }
6400
6401     /* If the bitmap didn't (or couldn't) match, and something outside the
6402      * bitmap could match, try that */
6403     if (!match) {
6404         if (utf8_target && (flags & ANYOF_UNICODE_ALL)) {
6405             if (c >= 256
6406                 || ((flags & ANYOF_FOLD) /* Latin1 1 that has a non-Latin1 fold
6407                                             should match */
6408                     && _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c)))
6409             {
6410                 match = TRUE;
6411             }
6412         }
6413         if (!match && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6414                        || (utf8_target && flags & ANYOF_UTF8)))
6415         {
6416             AV *av;
6417             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6418
6419             if (sw) {
6420                 U8 * utf8_p;
6421                 if (utf8_target) {
6422                     utf8_p = (U8 *) p;
6423                 } else {
6424                     STRLEN len = 1;
6425                     utf8_p = bytes_to_utf8(p, &len);
6426                 }
6427                 if (swash_fetch(sw, utf8_p, 1))
6428                     match = TRUE;
6429                 else if (flags & ANYOF_FOLD) {
6430                     if (!match && lenp && av) {
6431                         I32 i;
6432                         for (i = 0; i <= av_len(av); i++) {
6433                             SV* const sv = *av_fetch(av, i, FALSE);
6434                             STRLEN len;
6435                             const char * const s = SvPV_const(sv, len);
6436                             if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) {
6437                                 *lenp = len;
6438                                 match = TRUE;
6439                                 break;
6440                             }
6441                         }
6442                     }
6443                     if (!match) { /* See if the folded version matches */
6444                         U8 folded[UTF8_MAXBYTES_CASE+1];
6445                         SV** listp;
6446                         STRLEN foldlen;
6447
6448                         to_utf8_fold(utf8_p, folded, &foldlen);
6449
6450                         /* Consider "k" =~ /[K]/i.  The line above would have
6451                          * just folded the 'k' to itself, and that isn't going
6452                          * to match 'K'.  So we look through the closure of
6453                          * everything that folds to 'k'.  That will find the
6454                          * 'K'.  Initialize the list, if necessary */
6455                         if (! PL_utf8_foldclosures) {
6456
6457                             /* If the folds haven't been read in, call a fold
6458                              * function to force that */
6459                             if (! PL_utf8_tofold) {
6460                                 U8 dummy[UTF8_MAXBYTES+1];
6461                                 STRLEN dummy_len;
6462                                 to_utf8_fold((U8*) "A", dummy, &dummy_len);
6463                             }
6464                             PL_utf8_foldclosures =
6465                                   _swash_inversion_hash(PL_utf8_tofold);
6466                         }
6467
6468                         /* The data structure is a hash with the keys every
6469                          * character that is folded to, like 'k', and the
6470                          * values each an array of everything that folds to its
6471                          * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
6472                         if ((listp = hv_fetch(PL_utf8_foldclosures,
6473                                       (char *) folded, foldlen, FALSE)))
6474                         {
6475                             AV* list = (AV*) *listp;
6476                             IV i;
6477                             for (i = 0; i <= av_len(list); i++) {
6478                                 SV** try_p = av_fetch(list, i, FALSE);
6479                                 char* try_c;
6480                                 if (try_p == NULL) {
6481                                     Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
6482                                 }
6483                                 /* Don't have to worry about embeded nulls
6484                                  * since NULL isn't folded or foldable */
6485                                 try_c = SvPVX(*try_p);
6486
6487                                 /* The fold in a few cases  of an above Latin1
6488                                  * char is in the Latin1 range, and hence may
6489                                  * be in the bitmap */
6490                                 if (UTF8_IS_INVARIANT(*try_c)
6491                                     && ANYOF_BITMAP_TEST(n,
6492                                                     UNI_TO_NATIVE(*try_c)))
6493                                 {
6494                                     match = TRUE;
6495                                     break;
6496                                 }
6497                                 else if
6498                                     (UTF8_IS_DOWNGRADEABLE_START(*try_c)
6499                                      && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(
6500                                                 TWO_BYTE_UTF8_TO_UNI(try_c[0],
6501                                                                     try_c[1]))))
6502                                 {
6503                                    /* Since the fold comes from internally
6504                                     * generated data, we can safely assume it
6505                                     * is valid utf8 in the test above */
6506                                     match = TRUE;
6507                                     break;
6508                                 } else if (swash_fetch(sw, (U8*) try_c, 1)) {
6509                                     match = TRUE;
6510                                     break;
6511                                 }
6512                             }
6513                         }
6514                     }
6515                 }
6516
6517                 /* If we allocated a string above, free it */
6518                 if (! utf8_target) Safefree(utf8_p);
6519             }
6520         }
6521     }
6522
6523     return (flags & ANYOF_INVERT) ? !match : match;
6524 }
6525
6526 STATIC U8 *
6527 S_reghop3(U8 *s, I32 off, const U8* lim)
6528 {
6529     dVAR;
6530
6531     PERL_ARGS_ASSERT_REGHOP3;
6532
6533     if (off >= 0) {
6534         while (off-- && s < lim) {
6535             /* XXX could check well-formedness here */
6536             s += UTF8SKIP(s);
6537         }
6538     }
6539     else {
6540         while (off++ && s > lim) {
6541             s--;
6542             if (UTF8_IS_CONTINUED(*s)) {
6543                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6544                     s--;
6545             }
6546             /* XXX could check well-formedness here */
6547         }
6548     }
6549     return s;
6550 }
6551
6552 #ifdef XXX_dmq
6553 /* there are a bunch of places where we use two reghop3's that should
6554    be replaced with this routine. but since thats not done yet
6555    we ifdef it out - dmq
6556 */
6557 STATIC U8 *
6558 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6559 {
6560     dVAR;
6561
6562     PERL_ARGS_ASSERT_REGHOP4;
6563
6564     if (off >= 0) {
6565         while (off-- && s < rlim) {
6566             /* XXX could check well-formedness here */
6567             s += UTF8SKIP(s);
6568         }
6569     }
6570     else {
6571         while (off++ && s > llim) {
6572             s--;
6573             if (UTF8_IS_CONTINUED(*s)) {
6574                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6575                     s--;
6576             }
6577             /* XXX could check well-formedness here */
6578         }
6579     }
6580     return s;
6581 }
6582 #endif
6583
6584 STATIC U8 *
6585 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6586 {
6587     dVAR;
6588
6589     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6590
6591     if (off >= 0) {
6592         while (off-- && s < lim) {
6593             /* XXX could check well-formedness here */
6594             s += UTF8SKIP(s);
6595         }
6596         if (off >= 0)
6597             return NULL;
6598     }
6599     else {
6600         while (off++ && s > lim) {
6601             s--;
6602             if (UTF8_IS_CONTINUED(*s)) {
6603                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6604                     s--;
6605             }
6606             /* XXX could check well-formedness here */
6607         }
6608         if (off <= 0)
6609             return NULL;
6610     }
6611     return s;
6612 }
6613
6614 static void
6615 restore_pos(pTHX_ void *arg)
6616 {
6617     dVAR;
6618     regexp * const rex = (regexp *)arg;
6619     if (PL_reg_eval_set) {
6620         if (PL_reg_oldsaved) {
6621             rex->subbeg = PL_reg_oldsaved;
6622             rex->sublen = PL_reg_oldsavedlen;
6623 #ifdef PERL_OLD_COPY_ON_WRITE
6624             rex->saved_copy = PL_nrs;
6625 #endif
6626             RXp_MATCH_COPIED_on(rex);
6627         }
6628         PL_reg_magic->mg_len = PL_reg_oldpos;
6629         PL_reg_eval_set = 0;
6630         PL_curpm = PL_reg_oldcurpm;
6631     }
6632 }
6633
6634 STATIC void
6635 S_to_utf8_substr(pTHX_ register regexp *prog)
6636 {
6637     int i = 1;
6638
6639     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6640
6641     do {
6642         if (prog->substrs->data[i].substr
6643             && !prog->substrs->data[i].utf8_substr) {
6644             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6645             prog->substrs->data[i].utf8_substr = sv;
6646             sv_utf8_upgrade(sv);
6647             if (SvVALID(prog->substrs->data[i].substr)) {
6648                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6649                 if (flags & FBMcf_TAIL) {
6650                     /* Trim the trailing \n that fbm_compile added last
6651                        time.  */
6652                     SvCUR_set(sv, SvCUR(sv) - 1);
6653                     /* Whilst this makes the SV technically "invalid" (as its
6654                        buffer is no longer followed by "\0") when fbm_compile()
6655                        adds the "\n" back, a "\0" is restored.  */
6656                 }
6657                 fbm_compile(sv, flags);
6658             }
6659             if (prog->substrs->data[i].substr == prog->check_substr)
6660                 prog->check_utf8 = sv;
6661         }
6662     } while (i--);
6663 }
6664
6665 STATIC void
6666 S_to_byte_substr(pTHX_ register regexp *prog)
6667 {
6668     dVAR;
6669     int i = 1;
6670
6671     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6672
6673     do {
6674         if (prog->substrs->data[i].utf8_substr
6675             && !prog->substrs->data[i].substr) {
6676             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6677             if (sv_utf8_downgrade(sv, TRUE)) {
6678                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6679                     const U8 flags
6680                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6681                     if (flags & FBMcf_TAIL) {
6682                         /* Trim the trailing \n that fbm_compile added last
6683                            time.  */
6684                         SvCUR_set(sv, SvCUR(sv) - 1);
6685                     }
6686                     fbm_compile(sv, flags);
6687                 }
6688             } else {
6689                 SvREFCNT_dec(sv);
6690                 sv = &PL_sv_undef;
6691             }
6692             prog->substrs->data[i].substr = sv;
6693             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6694                 prog->check_substr = sv;
6695         }
6696     } while (i--);
6697 }
6698
6699 /*
6700  * Local variables:
6701  * c-indentation-style: bsd
6702  * c-basic-offset: 4
6703  * indent-tabs-mode: t
6704  * End:
6705  *
6706  * ex: set ts=8 sts=4 sw=4 noet:
6707  */