src/5012001/orig/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *      One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  *      Copyright (c) 1986 by University of Toronto.
  44  *      Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  *      Permission is granted to anyone to use this software for any
  47  *      purpose on any computer system, and to redistribute it freely,
  48  *      subject to the following restrictions:
  49  *
  50  *      1. The author is not responsible for the consequences of use of
  51  *              this software, no matter how awful, even if they arise
  52  *              from defects in it.
  53  *
  54  *      2. The origin of this software must not be misrepresented, either
  55  *              by explicit claim or by omission.
  56  *
  57  *      3. Altered versions must be plainly marked as such, and must not
  58  *              be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76
  77 #ifdef PERL_IN_XSUB_RE
  78 #  include "re_comp.h"
  79 #else
  80 #  include "regcomp.h"
  81 #endif
  82
  83 #define RF_tainted      1               /* tainted information used? */
  84 #define RF_warned       2               /* warned about big count? */
  85
  86 #define RF_utf8         8               /* Pattern contains multibyte chars? */
  87
  88 #define UTF ((PL_reg_flags & RF_utf8) != 0)
  89
  90 #define RS_init         1               /* eval environment created */
  91 #define RS_set          2               /* replsv value is set */
  92
  93 #ifndef STATIC
  94 #define STATIC  static
  95 #endif
  96
  97 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
  98
  99 /*
 100  * Forwards.
 101  */
 102
 103 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
 104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 105
 106 #define HOPc(pos,off) \
 107         (char *)(PL_reg_match_utf8 \
 108             ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 109             : (U8*)(pos + off))
 110 #define HOPBACKc(pos, off) \
 111         (char*)(PL_reg_match_utf8\
 112             ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 113             : (pos - off >= PL_bostr)           \
 114                 ? (U8*)pos - off                \
 115                 : NULL)
 116
 117 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 118 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 119
 120 /* these are unrolled below in the CCC_TRY_XXX defined */
 121 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 122     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 123
 124 /* Doesn't do an assert to verify that is correct */
 125 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 126     if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 127
 128 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 129 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 130 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 131
 132 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 133         LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 134         LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 135         /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 136             * assert should likely and hopefully fail on an EBCDIC machine */ \
 137         LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 138                                                                             \
 139         /* No asserts are done for these, in case called on an early        \
 140             * Unicode version in which they map to nothing */               \
 141         LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 142         LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);          /* U+1100 "\xe1\x84\x80" */ \
 143         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 144         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 145         LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 146         LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 147         LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 148
 149 /*
 150    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 151    so that it is possible to override the option here without having to
 152    rebuild the entire core. as we are required to do if we change regcomp.h
 153    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 154 */
 155 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 156 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 157 #endif
 158
 159 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 160 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 161 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 162 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 163 #define RE_utf8_perl_word   PL_utf8_alnum
 164 #define RE_utf8_perl_space  PL_utf8_space
 165 #define RE_utf8_posix_digit PL_utf8_digit
 166 #define perl_word  alnum
 167 #define perl_space space
 168 #define posix_digit digit
 169 #else
 170 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 171 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 172 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 173 #define RE_utf8_perl_word   PL_utf8_perl_word
 174 #define RE_utf8_perl_space  PL_utf8_perl_space
 175 #define RE_utf8_posix_digit PL_utf8_posix_digit
 176 #endif
 177
 178
 179 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
 180         case NAMEL:                                                              \
 181             PL_reg_flags |= RF_tainted;                                                 \
 182             /* FALL THROUGH */                                                          \
 183         case NAME:                                                                     \
 184             if (!nextchr)                                                               \
 185                 sayNO;                                                                  \
 186             if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 187                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 188                     bool ok;                                                            \
 189                     ENTER;                                                              \
 190                     save_re_context();                                                  \
 191                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 192                     assert(ok);                                                         \
 193                     LEAVE;                                                              \
 194                 }                                                                       \
 195                 if (!(OP(scan) == NAME                                                  \
 196                     ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)   \
 197                     : LCFUNC_utf8((U8*)locinput)))                                      \
 198                 {                                                                       \
 199                     sayNO;                                                              \
 200                 }                                                                       \
 201                 locinput += PL_utf8skip[nextchr];                                       \
 202                 nextchr = UCHARAT(locinput);                                            \
 203                 break;                                                                  \
 204             }                                                                           \
 205             if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
 206                 sayNO;                                                                  \
 207             nextchr = UCHARAT(++locinput);                                              \
 208             break
 209
 210 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
 211         case NAMEL:                                                              \
 212             PL_reg_flags |= RF_tainted;                                                 \
 213             /* FALL THROUGH */                                                          \
 214         case NAME :                                                                     \
 215             if (!nextchr && locinput >= PL_regeol)                                      \
 216                 sayNO;                                                                  \
 217             if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 218                 if (!CAT2(PL_utf8_,CLASS)) {                                            \
 219                     bool ok;                                                            \
 220                     ENTER;                                                              \
 221                     save_re_context();                                                  \
 222                     ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 223                     assert(ok);                                                         \
 224                     LEAVE;                                                              \
 225                 }                                                                       \
 226                 if ((OP(scan) == NAME                                                  \
 227                     ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)    \
 228                     : LCFUNC_utf8((U8*)locinput)))                                      \
 229                 {                                                                       \
 230                     sayNO;                                                              \
 231                 }                                                                       \
 232                 locinput += PL_utf8skip[nextchr];                                       \
 233                 nextchr = UCHARAT(locinput);                                            \
 234                 break;                                                                  \
 235             }                                                                           \
 236             if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
 237                 sayNO;                                                                  \
 238             nextchr = UCHARAT(++locinput);                                              \
 239             break
 240
 241
 242
 243
 244
 245 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 246
 247 /* for use after a quantifier and before an EXACT-like node -- japhy */
 248 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
 249 #define JUMPABLE(rn) (      \
 250     OP(rn) == OPEN ||       \
 251     (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 252     OP(rn) == EVAL ||   \
 253     OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 254     OP(rn) == PLUS || OP(rn) == MINMOD || \
 255     OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
 256     (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 257 )
 258 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 259
 260 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 261
 262 #if 0
 263 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 264    we don't need this definition. */
 265 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 266 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 267 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 268
 269 #else
 270 /* ... so we use this as its faster. */
 271 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 272 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 273 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 274
 275 #endif
 276
 277 /*
 278   Search for mandatory following text node; for lookahead, the text must
 279   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 280 */
 281 #define FIND_NEXT_IMPT(rn) STMT_START { \
 282     while (JUMPABLE(rn)) { \
 283         const OPCODE type = OP(rn); \
 284         if (type == SUSPEND || PL_regkind[type] == CURLY) \
 285             rn = NEXTOPER(NEXTOPER(rn)); \
 286         else if (type == PLUS) \
 287             rn = NEXTOPER(rn); \
 288         else if (type == IFMATCH) \
 289             rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 290         else rn += NEXT_OFF(rn); \
 291     } \
 292 } STMT_END
 293
 294
 295 static void restore_pos(pTHX_ void *arg);
 296
 297 STATIC CHECKPOINT
 298 S_regcppush(pTHX_ I32 parenfloor)
 299 {
 300     dVAR;
 301     const int retval = PL_savestack_ix;
 302 #define REGCP_PAREN_ELEMS 4
 303     const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 304     int p;
 305     GET_RE_DEBUG_FLAGS_DECL;
 306
 307     if (paren_elems_to_push < 0)
 308         Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 309
 310 #define REGCP_OTHER_ELEMS 7
 311     SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
 312
 313     for (p = PL_regsize; p > parenfloor; p--) {
 314 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 315         SSPUSHINT(PL_regoffs[p].end);
 316         SSPUSHINT(PL_regoffs[p].start);
 317         SSPUSHPTR(PL_reg_start_tmp[p]);
 318         SSPUSHINT(p);
 319         DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 320           "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 321                       (UV)p, (IV)PL_regoffs[p].start,
 322                       (IV)(PL_reg_start_tmp[p] - PL_bostr),
 323                       (IV)PL_regoffs[p].end
 324         ));
 325     }
 326 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 327     SSPUSHPTR(PL_regoffs);
 328     SSPUSHINT(PL_regsize);
 329     SSPUSHINT(*PL_reglastparen);
 330     SSPUSHINT(*PL_reglastcloseparen);
 331     SSPUSHPTR(PL_reginput);
 332 #define REGCP_FRAME_ELEMS 2
 333 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
 334  * are needed for the regexp context stack bookkeeping. */
 335     SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
 336     SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
 337
 338     return retval;
 339 }
 340
 341 /* These are needed since we do not localize EVAL nodes: */
 342 #define REGCP_SET(cp)                                           \
 343     DEBUG_STATE_r(                                              \
 344             PerlIO_printf(Perl_debug_log,                       \
 345                 "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 346                 (IV)PL_savestack_ix));                          \
 347     cp = PL_savestack_ix
 348
 349 #define REGCP_UNWIND(cp)                                        \
 350     DEBUG_STATE_r(                                              \
 351         if (cp != PL_savestack_ix)                              \
 352             PerlIO_printf(Perl_debug_log,                       \
 353                 "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 354                 (IV)(cp), (IV)PL_savestack_ix));                \
 355     regcpblow(cp)
 356
 357 STATIC char *
 358 S_regcppop(pTHX_ const regexp *rex)
 359 {
 360     dVAR;
 361     U32 i;
 362     char *input;
 363     GET_RE_DEBUG_FLAGS_DECL;
 364
 365     PERL_ARGS_ASSERT_REGCPPOP;
 366
 367     /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 368     i = SSPOPINT;
 369     assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 370     i = SSPOPINT; /* Parentheses elements to pop. */
 371     input = (char *) SSPOPPTR;
 372     *PL_reglastcloseparen = SSPOPINT;
 373     *PL_reglastparen = SSPOPINT;
 374     PL_regsize = SSPOPINT;
 375     PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 376
 377
 378     /* Now restore the parentheses context. */
 379     for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
 380          i > 0; i -= REGCP_PAREN_ELEMS) {
 381         I32 tmps;
 382         U32 paren = (U32)SSPOPINT;
 383         PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 384         PL_regoffs[paren].start = SSPOPINT;
 385         tmps = SSPOPINT;
 386         if (paren <= *PL_reglastparen)
 387             PL_regoffs[paren].end = tmps;
 388         DEBUG_BUFFERS_r(
 389             PerlIO_printf(Perl_debug_log,
 390                           "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 391                           (UV)paren, (IV)PL_regoffs[paren].start,
 392                           (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 393                           (IV)PL_regoffs[paren].end,
 394                           (paren > *PL_reglastparen ? "(no)" : ""));
 395         );
 396     }
 397     DEBUG_BUFFERS_r(
 398         if (*PL_reglastparen + 1 <= rex->nparens) {
 399             PerlIO_printf(Perl_debug_log,
 400                           "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 401                           (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 402         }
 403     );
 404 #if 1
 405     /* It would seem that the similar code in regtry()
 406      * already takes care of this, and in fact it is in
 407      * a better location to since this code can #if 0-ed out
 408      * but the code in regtry() is needed or otherwise tests
 409      * requiring null fields (pat.t#187 and split.t#{13,14}
 410      * (as of patchlevel 7877)  will fail.  Then again,
 411      * this code seems to be necessary or otherwise
 412      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 413      * --jhi updated by dapm */
 414     for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 415         if (i > PL_regsize)
 416             PL_regoffs[i].start = -1;
 417         PL_regoffs[i].end = -1;
 418     }
 419 #endif
 420     return input;
 421 }
 422
 423 #define regcpblow(cp) LEAVE_SCOPE(cp)   /* Ignores regcppush()ed data. */
 424
 425 /*
 426  * pregexec and friends
 427  */
 428
 429 #ifndef PERL_IN_XSUB_RE
 430 /*
 431  - pregexec - match a regexp against a string
 432  */
 433 I32
 434 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 435          char *strbeg, I32 minend, SV *screamer, U32 nosave)
 436 /* strend: pointer to null at end of string */
 437 /* strbeg: real beginning of string */
 438 /* minend: end of match must be >=minend after stringarg. */
 439 /* nosave: For optimizations. */
 440 {
 441     PERL_ARGS_ASSERT_PREGEXEC;
 442
 443     return
 444         regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 445                       nosave ? 0 : REXEC_COPY_STR);
 446 }
 447 #endif
 448
 449 /*
 450  * Need to implement the following flags for reg_anch:
 451  *
 452  * USE_INTUIT_NOML              - Useful to call re_intuit_start() first
 453  * USE_INTUIT_ML
 454  * INTUIT_AUTORITATIVE_NOML     - Can trust a positive answer
 455  * INTUIT_AUTORITATIVE_ML
 456  * INTUIT_ONCE_NOML             - Intuit can match in one location only.
 457  * INTUIT_ONCE_ML
 458  *
 459  * Another flag for this function: SECOND_TIME (so that float substrs
 460  * with giant delta may be not rechecked).
 461  */
 462
 463 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 464
 465 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 466    Otherwise, only SvCUR(sv) is used to get strbeg. */
 467
 468 /* XXXX We assume that strpos is strbeg unless sv. */
 469
 470 /* XXXX Some places assume that there is a fixed substring.
 471         An update may be needed if optimizer marks as "INTUITable"
 472         RExen without fixed substrings.  Similarly, it is assumed that
 473         lengths of all the strings are no more than minlen, thus they
 474         cannot come from lookahead.
 475         (Or minlen should take into account lookahead.)
 476   NOTE: Some of this comment is not correct. minlen does now take account
 477   of lookahead/behind. Further research is required. -- demerphq
 478
 479 */
 480
 481 /* A failure to find a constant substring means that there is no need to make
 482    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 483    finding a substring too deep into the string means that less calls to
 484    regtry() should be needed.
 485
 486    REx compiler's optimizer found 4 possible hints:
 487         a) Anchored substring;
 488         b) Fixed substring;
 489         c) Whether we are anchored (beginning-of-line or \G);
 490         d) First node (of those at offset 0) which may distingush positions;
 491    We use a)b)d) and multiline-part of c), and try to find a position in the
 492    string which does not contradict any of them.
 493  */
 494
 495 /* Most of decisions we do here should have been done at compile time.
 496    The nodes of the REx which we used for the search should have been
 497    deleted from the finite automaton. */
 498
 499 char *
 500 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 501                      char *strend, const U32 flags, re_scream_pos_data *data)
 502 {
 503     dVAR;
 504     struct regexp *const prog = (struct regexp *)SvANY(rx);
 505     register I32 start_shift = 0;
 506     /* Should be nonnegative! */
 507     register I32 end_shift   = 0;
 508     register char *s;
 509     register SV *check;
 510     char *strbeg;
 511     char *t;
 512     const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 513     I32 ml_anch;
 514     register char *other_last = NULL;   /* other substr checked before this */
 515     char *check_at = NULL;              /* check substr found at this pos */
 516     const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 517     RXi_GET_DECL(prog,progi);
 518 #ifdef DEBUGGING
 519     const char * const i_strpos = strpos;
 520 #endif
 521     GET_RE_DEBUG_FLAGS_DECL;
 522
 523     PERL_ARGS_ASSERT_RE_INTUIT_START;
 524
 525     RX_MATCH_UTF8_set(rx,do_utf8);
 526
 527     if (RX_UTF8(rx)) {
 528         PL_reg_flags |= RF_utf8;
 529     }
 530     DEBUG_EXECUTE_r(
 531         debug_start_match(rx, do_utf8, strpos, strend,
 532             sv ? "Guessing start of match in sv for"
 533                : "Guessing start of match in string for");
 534               );
 535
 536     /* CHR_DIST() would be more correct here but it makes things slow. */
 537     if (prog->minlen > strend - strpos) {
 538         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 539                               "String too short... [re_intuit_start]\n"));
 540         goto fail;
 541     }
 542
 543     strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 544     PL_regeol = strend;
 545     if (do_utf8) {
 546         if (!prog->check_utf8 && prog->check_substr)
 547             to_utf8_substr(prog);
 548         check = prog->check_utf8;
 549     } else {
 550         if (!prog->check_substr && prog->check_utf8)
 551             to_byte_substr(prog);
 552         check = prog->check_substr;
 553     }
 554     if (check == &PL_sv_undef) {
 555         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 556                 "Non-utf8 string cannot match utf8 check string\n"));
 557         goto fail;
 558     }
 559     if (prog->extflags & RXf_ANCH) {    /* Match at beg-of-str or after \n */
 560         ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 561                      || ( (prog->extflags & RXf_ANCH_BOL)
 562                           && !multiline ) );    /* Check after \n? */
 563
 564         if (!ml_anch) {
 565           if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 566                 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 567                /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 568                && sv && !SvROK(sv)
 569                && (strpos != strbeg)) {
 570               DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 571               goto fail;
 572           }
 573           if (prog->check_offset_min == prog->check_offset_max &&
 574               !(prog->extflags & RXf_CANY_SEEN)) {
 575             /* Substring at constant offset from beg-of-str... */
 576             I32 slen;
 577
 578             s = HOP3c(strpos, prog->check_offset_min, strend);
 579
 580             if (SvTAIL(check)) {
 581                 slen = SvCUR(check);    /* >= 1 */
 582
 583                 if ( strend - s > slen || strend - s < slen - 1
 584                      || (strend - s == slen && strend[-1] != '\n')) {
 585                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 586                     goto fail_finish;
 587                 }
 588                 /* Now should match s[0..slen-2] */
 589                 slen--;
 590                 if (slen && (*SvPVX_const(check) != *s
 591                              || (slen > 1
 592                                  && memNE(SvPVX_const(check), s, slen)))) {
 593                   report_neq:
 594                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 595                     goto fail_finish;
 596                 }
 597             }
 598             else if (*SvPVX_const(check) != *s
 599                      || ((slen = SvCUR(check)) > 1
 600                          && memNE(SvPVX_const(check), s, slen)))
 601                 goto report_neq;
 602             check_at = s;
 603             goto success_at_start;
 604           }
 605         }
 606         /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 607         s = strpos;
 608         start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 609         end_shift = prog->check_end_shift;
 610
 611         if (!ml_anch) {
 612             const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 613                                          - (SvTAIL(check) != 0);
 614             const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 615
 616             if (end_shift < eshift)
 617                 end_shift = eshift;
 618         }
 619     }
 620     else {                              /* Can match at random position */
 621         ml_anch = 0;
 622         s = strpos;
 623         start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 624         end_shift = prog->check_end_shift;
 625
 626         /* end shift should be non negative here */
 627     }
 628
 629 #ifdef QDEBUGGING       /* 7/99: reports of failure (with the older version) */
 630     if (end_shift < 0)
 631         Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 632                    (IV)end_shift, RX_PRECOMP(prog));
 633 #endif
 634
 635   restart:
 636     /* Find a possible match in the region s..strend by looking for
 637        the "check" substring in the region corrected by start/end_shift. */
 638
 639     {
 640         I32 srch_start_shift = start_shift;
 641         I32 srch_end_shift = end_shift;
 642         if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 643             srch_end_shift -= ((strbeg - s) - srch_start_shift);
 644             srch_start_shift = strbeg - s;
 645         }
 646     DEBUG_OPTIMISE_MORE_r({
 647         PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 648             (IV)prog->check_offset_min,
 649             (IV)srch_start_shift,
 650             (IV)srch_end_shift,
 651             (IV)prog->check_end_shift);
 652     });
 653
 654     if (flags & REXEC_SCREAM) {
 655         I32 p = -1;                     /* Internal iterator of scream. */
 656         I32 * const pp = data ? data->scream_pos : &p;
 657
 658         if (PL_screamfirst[BmRARE(check)] >= 0
 659             || ( BmRARE(check) == '\n'
 660                  && (BmPREVIOUS(check) == SvCUR(check) - 1)
 661                  && SvTAIL(check) ))
 662             s = screaminstr(sv, check,
 663                             srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 664         else
 665             goto fail_finish;
 666         /* we may be pointing at the wrong string */
 667         if (s && RXp_MATCH_COPIED(prog))
 668             s = strbeg + (s - SvPVX_const(sv));
 669         if (data)
 670             *data->scream_olds = s;
 671     }
 672     else {
 673         U8* start_point;
 674         U8* end_point;
 675         if (prog->extflags & RXf_CANY_SEEN) {
 676             start_point= (U8*)(s + srch_start_shift);
 677             end_point= (U8*)(strend - srch_end_shift);
 678         } else {
 679             start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 680             end_point= HOP3(strend, -srch_end_shift, strbeg);
 681         }
 682         DEBUG_OPTIMISE_MORE_r({
 683             PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 684                 (int)(end_point - start_point),
 685                 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 686                 start_point);
 687         });
 688
 689         s = fbm_instr( start_point, end_point,
 690                       check, multiline ? FBMrf_MULTILINE : 0);
 691     }
 692     }
 693     /* Update the count-of-usability, remove useless subpatterns,
 694         unshift s.  */
 695
 696     DEBUG_EXECUTE_r({
 697         RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 698             SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 699         PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 700                           (s ? "Found" : "Did not find"),
 701             (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
 702                 ? "anchored" : "floating"),
 703             quoted,
 704             RE_SV_TAIL(check),
 705             (s ? " at offset " : "...\n") );
 706     });
 707
 708     if (!s)
 709         goto fail_finish;
 710     /* Finish the diagnostic message */
 711     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 712
 713     /* XXX dmq: first branch is for positive lookbehind...
 714        Our check string is offset from the beginning of the pattern.
 715        So we need to do any stclass tests offset forward from that
 716        point. I think. :-(
 717      */
 718
 719
 720
 721     check_at=s;
 722
 723
 724     /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 725        Start with the other substr.
 726        XXXX no SCREAM optimization yet - and a very coarse implementation
 727        XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 728                 *always* match.  Probably should be marked during compile...
 729        Probably it is right to do no SCREAM here...
 730      */
 731
 732     if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
 733                 : (prog->float_substr && prog->anchored_substr))
 734     {
 735         /* Take into account the "other" substring. */
 736         /* XXXX May be hopelessly wrong for UTF... */
 737         if (!other_last)
 738             other_last = strpos;
 739         if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
 740           do_other_anchored:
 741             {
 742                 char * const last = HOP3c(s, -start_shift, strbeg);
 743                 char *last1, *last2;
 744                 char * const saved_s = s;
 745                 SV* must;
 746
 747                 t = s - prog->check_offset_max;
 748                 if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 749                     && (!do_utf8
 750                         || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 751                             && t > strpos)))
 752                     NOOP;
 753                 else
 754                     t = strpos;
 755                 t = HOP3c(t, prog->anchored_offset, strend);
 756                 if (t < other_last)     /* These positions already checked */
 757                     t = other_last;
 758                 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 759                 if (last < last1)
 760                     last1 = last;
 761                 /* XXXX It is not documented what units *_offsets are in.
 762                    We assume bytes, but this is clearly wrong.
 763                    Meaning this code needs to be carefully reviewed for errors.
 764                    dmq.
 765                   */
 766
 767                 /* On end-of-str: see comment below. */
 768                 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
 769                 if (must == &PL_sv_undef) {
 770                     s = (char*)NULL;
 771                     DEBUG_r(must = prog->anchored_utf8);        /* for debug */
 772                 }
 773                 else
 774                     s = fbm_instr(
 775                         (unsigned char*)t,
 776                         HOP3(HOP3(last1, prog->anchored_offset, strend)
 777                                 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 778                         must,
 779                         multiline ? FBMrf_MULTILINE : 0
 780                     );
 781                 DEBUG_EXECUTE_r({
 782                     RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 783                         SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 784                     PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 785                         (s ? "Found" : "Contradicts"),
 786                         quoted, RE_SV_TAIL(must));
 787                 });
 788
 789
 790                 if (!s) {
 791                     if (last1 >= last2) {
 792                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 793                                                 ", giving up...\n"));
 794                         goto fail_finish;
 795                     }
 796                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 797                         ", trying floating at offset %ld...\n",
 798                         (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 799                     other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 800                     s = HOP3c(last, 1, strend);
 801                     goto restart;
 802                 }
 803                 else {
 804                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 805                           (long)(s - i_strpos)));
 806                     t = HOP3c(s, -prog->anchored_offset, strbeg);
 807                     other_last = HOP3c(s, 1, strend);
 808                     s = saved_s;
 809                     if (t == strpos)
 810                         goto try_at_start;
 811                     goto try_at_offset;
 812                 }
 813             }
 814         }
 815         else {          /* Take into account the floating substring. */
 816             char *last, *last1;
 817             char * const saved_s = s;
 818             SV* must;
 819
 820             t = HOP3c(s, -start_shift, strbeg);
 821             last1 = last =
 822                 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 823             if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 824                 last = HOP3c(t, prog->float_max_offset, strend);
 825             s = HOP3c(t, prog->float_min_offset, strend);
 826             if (s < other_last)
 827                 s = other_last;
 828  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 829             must = do_utf8 ? prog->float_utf8 : prog->float_substr;
 830             /* fbm_instr() takes into account exact value of end-of-str
 831                if the check is SvTAIL(ed).  Since false positives are OK,
 832                and end-of-str is not later than strend we are OK. */
 833             if (must == &PL_sv_undef) {
 834                 s = (char*)NULL;
 835                 DEBUG_r(must = prog->float_utf8);       /* for debug message */
 836             }
 837             else
 838                 s = fbm_instr((unsigned char*)s,
 839                               (unsigned char*)last + SvCUR(must)
 840                                   - (SvTAIL(must)!=0),
 841                               must, multiline ? FBMrf_MULTILINE : 0);
 842             DEBUG_EXECUTE_r({
 843                 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 844                     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 845                 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 846                     (s ? "Found" : "Contradicts"),
 847                     quoted, RE_SV_TAIL(must));
 848             });
 849             if (!s) {
 850                 if (last1 == last) {
 851                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 852                                             ", giving up...\n"));
 853                     goto fail_finish;
 854                 }
 855                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 856                     ", trying anchored starting at offset %ld...\n",
 857                     (long)(saved_s + 1 - i_strpos)));
 858                 other_last = last;
 859                 s = HOP3c(t, 1, strend);
 860                 goto restart;
 861             }
 862             else {
 863                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 864                       (long)(s - i_strpos)));
 865                 other_last = s; /* Fix this later. --Hugo */
 866                 s = saved_s;
 867                 if (t == strpos)
 868                     goto try_at_start;
 869                 goto try_at_offset;
 870             }
 871         }
 872     }
 873
 874
 875     t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 876
 877     DEBUG_OPTIMISE_MORE_r(
 878         PerlIO_printf(Perl_debug_log,
 879             "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 880             (IV)prog->check_offset_min,
 881             (IV)prog->check_offset_max,
 882             (IV)(s-strpos),
 883             (IV)(t-strpos),
 884             (IV)(t-s),
 885             (IV)(strend-strpos)
 886         )
 887     );
 888
 889     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 890         && (!do_utf8
 891             || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 892                  && t > strpos)))
 893     {
 894         /* Fixed substring is found far enough so that the match
 895            cannot start at strpos. */
 896       try_at_offset:
 897         if (ml_anch && t[-1] != '\n') {
 898             /* Eventually fbm_*() should handle this, but often
 899                anchored_offset is not 0, so this check will not be wasted. */
 900             /* XXXX In the code below we prefer to look for "^" even in
 901                presence of anchored substrings.  And we search even
 902                beyond the found float position.  These pessimizations
 903                are historical artefacts only.  */
 904           find_anchor:
 905             while (t < strend - prog->minlen) {
 906                 if (*t == '\n') {
 907                     if (t < check_at - prog->check_offset_min) {
 908                         if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
 909                             /* Since we moved from the found position,
 910                                we definitely contradict the found anchored
 911                                substr.  Due to the above check we do not
 912                                contradict "check" substr.
 913                                Thus we can arrive here only if check substr
 914                                is float.  Redo checking for "other"=="fixed".
 915                              */
 916                             strpos = t + 1;
 917                             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 918                                 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 919                             goto do_other_anchored;
 920                         }
 921                         /* We don't contradict the found floating substring. */
 922                         /* XXXX Why not check for STCLASS? */
 923                         s = t + 1;
 924                         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 925                             PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 926                         goto set_useful;
 927                     }
 928                     /* Position contradicts check-string */
 929                     /* XXXX probably better to look for check-string
 930                        than for "\n", so one should lower the limit for t? */
 931                     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 932                         PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 933                     other_last = strpos = s = t + 1;
 934                     goto restart;
 935                 }
 936                 t++;
 937             }
 938             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 939                         PL_colors[0], PL_colors[1]));
 940             goto fail_finish;
 941         }
 942         else {
 943             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 944                         PL_colors[0], PL_colors[1]));
 945         }
 946         s = t;
 947       set_useful:
 948         ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr);    /* hooray/5 */
 949     }
 950     else {
 951         /* The found string does not prohibit matching at strpos,
 952            - no optimization of calling REx engine can be performed,
 953            unless it was an MBOL and we are not after MBOL,
 954            or a future STCLASS check will fail this. */
 955       try_at_start:
 956         /* Even in this situation we may use MBOL flag if strpos is offset
 957            wrt the start of the string. */
 958         if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 959             && (strpos != strbeg) && strpos[-1] != '\n'
 960             /* May be due to an implicit anchor of m{.*foo}  */
 961             && !(prog->intflags & PREGf_IMPLICIT))
 962         {
 963             t = strpos;
 964             goto find_anchor;
 965         }
 966         DEBUG_EXECUTE_r( if (ml_anch)
 967             PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
 968                           (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
 969         );
 970       success_at_start:
 971         if (!(prog->intflags & PREGf_NAUGHTY)   /* XXXX If strpos moved? */
 972             && (do_utf8 ? (
 973                 prog->check_utf8                /* Could be deleted already */
 974                 && --BmUSEFUL(prog->check_utf8) < 0
 975                 && (prog->check_utf8 == prog->float_utf8)
 976             ) : (
 977                 prog->check_substr              /* Could be deleted already */
 978                 && --BmUSEFUL(prog->check_substr) < 0
 979                 && (prog->check_substr == prog->float_substr)
 980             )))
 981         {
 982             /* If flags & SOMETHING - do not do it many times on the same match */
 983             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
 984             /* XXX Does the destruction order has to change with do_utf8? */
 985             SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
 986             SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
 987             prog->check_substr = prog->check_utf8 = NULL;       /* disable */
 988             prog->float_substr = prog->float_utf8 = NULL;       /* clear */
 989             check = NULL;                       /* abort */
 990             s = strpos;
 991             /* XXXX This is a remnant of the old implementation.  It
 992                     looks wasteful, since now INTUIT can use many
 993                     other heuristics. */
 994             prog->extflags &= ~RXf_USE_INTUIT;
 995         }
 996         else
 997             s = strpos;
 998     }
 999
1000     /* Last resort... */
1001     /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1002     /* trie stclasses are too expensive to use here, we are better off to
1003        leave it to regmatch itself */
1004     if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1005         /* minlen == 0 is possible if regstclass is \b or \B,
1006            and the fixed substr is ''$.
1007            Since minlen is already taken into account, s+1 is before strend;
1008            accidentally, minlen >= 1 guaranties no false positives at s + 1
1009            even for \b or \B.  But (minlen? 1 : 0) below assumes that
1010            regstclass does not come from lookahead...  */
1011         /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1012            This leaves EXACTF only, which is dealt with in find_byclass().  */
1013         const U8* const str = (U8*)STRING(progi->regstclass);
1014         const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1015                     ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1016                     : 1);
1017         char * endpos;
1018         if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1019             endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1020         else if (prog->float_substr || prog->float_utf8)
1021             endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1022         else
1023             endpos= strend;
1024
1025         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1026                                       (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1027
1028         t = s;
1029         s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1030         if (!s) {
1031 #ifdef DEBUGGING
1032             const char *what = NULL;
1033 #endif
1034             if (endpos == strend) {
1035                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1036                                 "Could not match STCLASS...\n") );
1037                 goto fail;
1038             }
1039             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1040                                    "This position contradicts STCLASS...\n") );
1041             if ((prog->extflags & RXf_ANCH) && !ml_anch)
1042                 goto fail;
1043             /* Contradict one of substrings */
1044             if (prog->anchored_substr || prog->anchored_utf8) {
1045                 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1046                     DEBUG_EXECUTE_r( what = "anchored" );
1047                   hop_and_restart:
1048                     s = HOP3c(t, 1, strend);
1049                     if (s + start_shift + end_shift > strend) {
1050                         /* XXXX Should be taken into account earlier? */
1051                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1052                                                "Could not match STCLASS...\n") );
1053                         goto fail;
1054                     }
1055                     if (!check)
1056                         goto giveup;
1057                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1058                                 "Looking for %s substr starting at offset %ld...\n",
1059                                  what, (long)(s + start_shift - i_strpos)) );
1060                     goto restart;
1061                 }
1062                 /* Have both, check_string is floating */
1063                 if (t + start_shift >= check_at) /* Contradicts floating=check */
1064                     goto retry_floating_check;
1065                 /* Recheck anchored substring, but not floating... */
1066                 s = check_at;
1067                 if (!check)
1068                     goto giveup;
1069                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1070                           "Looking for anchored substr starting at offset %ld...\n",
1071                           (long)(other_last - i_strpos)) );
1072                 goto do_other_anchored;
1073             }
1074             /* Another way we could have checked stclass at the
1075                current position only: */
1076             if (ml_anch) {
1077                 s = t = t + 1;
1078                 if (!check)
1079                     goto giveup;
1080                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1081                           "Looking for /%s^%s/m starting at offset %ld...\n",
1082                           PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1083                 goto try_at_offset;
1084             }
1085             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))     /* Could have been deleted */
1086                 goto fail;
1087             /* Check is floating subtring. */
1088           retry_floating_check:
1089             t = check_at - start_shift;
1090             DEBUG_EXECUTE_r( what = "floating" );
1091             goto hop_and_restart;
1092         }
1093         if (t != s) {
1094             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1095                         "By STCLASS: moving %ld --> %ld\n",
1096                                   (long)(t - i_strpos), (long)(s - i_strpos))
1097                    );
1098         }
1099         else {
1100             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1101                                   "Does not contradict STCLASS...\n");
1102                    );
1103         }
1104     }
1105   giveup:
1106     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1107                           PL_colors[4], (check ? "Guessed" : "Giving up"),
1108                           PL_colors[5], (long)(s - i_strpos)) );
1109     return s;
1110
1111   fail_finish:                          /* Substring not found */
1112     if (prog->check_substr || prog->check_utf8)         /* could be removed already */
1113         BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1114   fail:
1115     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1116                           PL_colors[4], PL_colors[5]));
1117     return NULL;
1118 }
1119
1120 #define DECL_TRIE_TYPE(scan) \
1121     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1122                     trie_type = (scan->flags != EXACT) \
1123                               ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1124                               : (do_utf8 ? trie_utf8 : trie_plain)
1125
1126 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1127 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1128     switch (trie_type) {                                                    \
1129     case trie_utf8_fold:                                                    \
1130         if ( foldlen>0 ) {                                                  \
1131             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1132             foldlen -= len;                                                 \
1133             uscan += len;                                                   \
1134             len=0;                                                          \
1135         } else {                                                            \
1136             uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1137             uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1138             foldlen -= UNISKIP( uvc );                                      \
1139             uscan = foldbuf + UNISKIP( uvc );                               \
1140         }                                                                   \
1141         break;                                                              \
1142     case trie_latin_utf8_fold:                                              \
1143         if ( foldlen>0 ) {                                                  \
1144             uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1145             foldlen -= len;                                                 \
1146             uscan += len;                                                   \
1147             len=0;                                                          \
1148         } else {                                                            \
1149             len = 1;                                                        \
1150             uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1151             foldlen -= UNISKIP( uvc );                                      \
1152             uscan = foldbuf + UNISKIP( uvc );                               \
1153         }                                                                   \
1154         break;                                                              \
1155     case trie_utf8:                                                         \
1156         uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1157         break;                                                              \
1158     case trie_plain:                                                        \
1159         uvc = (UV)*uc;                                                      \
1160         len = 1;                                                            \
1161     }                                                                       \
1162     if (uvc < 256) {                                                        \
1163         charid = trie->charmap[ uvc ];                                      \
1164     }                                                                       \
1165     else {                                                                  \
1166         charid = 0;                                                         \
1167         if (widecharmap) {                                                  \
1168             SV** const svpp = hv_fetch(widecharmap,                         \
1169                         (char*)&uvc, sizeof(UV), 0);                        \
1170             if (svpp)                                                       \
1171                 charid = (U16)SvIV(*svpp);                                  \
1172         }                                                                   \
1173     }                                                                       \
1174 } STMT_END
1175
1176 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1177 {                                                      \
1178     char *my_strend= (char *)strend;                   \
1179     if ( (CoNd)                                        \
1180          && (ln == len ||                              \
1181              !ibcmp_utf8(s, &my_strend, 0,  do_utf8,   \
1182                         m, NULL, ln, (bool)UTF))       \
1183          && (!reginfo || regtry(reginfo, &s)) )        \
1184         goto got_it;                                   \
1185     else {                                             \
1186          U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1187          uvchr_to_utf8(tmpbuf, c);                     \
1188          f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1189          if ( f != c                                   \
1190               && (f == c1 || f == c2)                  \
1191               && (ln == len ||                         \
1192                 !ibcmp_utf8(s, &my_strend, 0,  do_utf8,\
1193                               m, NULL, ln, (bool)UTF)) \
1194               && (!reginfo || regtry(reginfo, &s)) )   \
1195               goto got_it;                             \
1196     }                                                  \
1197 }                                                      \
1198 s += len
1199
1200 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1201 STMT_START {                                              \
1202     while (s <= e) {                                      \
1203         if ( (CoNd)                                       \
1204              && (ln == 1 || !(OP(c) == EXACTF             \
1205                               ? ibcmp(s, m, ln)           \
1206                               : ibcmp_locale(s, m, ln)))  \
1207              && (!reginfo || regtry(reginfo, &s)) )        \
1208             goto got_it;                                  \
1209         s++;                                              \
1210     }                                                     \
1211 } STMT_END
1212
1213 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1214 STMT_START {                                          \
1215     while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1216         CoDe                                          \
1217         s += uskip;                                   \
1218     }                                                 \
1219 } STMT_END
1220
1221 #define REXEC_FBC_SCAN(CoDe)                          \
1222 STMT_START {                                          \
1223     while (s < strend) {                              \
1224         CoDe                                          \
1225         s++;                                          \
1226     }                                                 \
1227 } STMT_END
1228
1229 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1230 REXEC_FBC_UTF8_SCAN(                                  \
1231     if (CoNd) {                                       \
1232         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1233             goto got_it;                              \
1234         else                                          \
1235             tmp = doevery;                            \
1236     }                                                 \
1237     else                                              \
1238         tmp = 1;                                      \
1239 )
1240
1241 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1242 REXEC_FBC_SCAN(                                       \
1243     if (CoNd) {                                       \
1244         if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1245             goto got_it;                              \
1246         else                                          \
1247             tmp = doevery;                            \
1248     }                                                 \
1249     else                                              \
1250         tmp = 1;                                      \
1251 )
1252
1253 #define REXEC_FBC_TRYIT               \
1254 if ((!reginfo || regtry(reginfo, &s))) \
1255     goto got_it
1256
1257 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1258     if (do_utf8) {                                             \
1259         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1260     }                                                          \
1261     else {                                                     \
1262         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1263     }                                                          \
1264     break
1265
1266 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1267     if (do_utf8) {                                             \
1268         UtFpReLoAd;                                            \
1269         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1270     }                                                          \
1271     else {                                                     \
1272         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1273     }                                                          \
1274     break
1275
1276 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1277     PL_reg_flags |= RF_tainted;                                \
1278     if (do_utf8) {                                             \
1279         REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1280     }                                                          \
1281     else {                                                     \
1282         REXEC_FBC_CLASS_SCAN(CoNd);                            \
1283     }                                                          \
1284     break
1285
1286 #define DUMP_EXEC_POS(li,s,doutf8) \
1287     dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1288
1289 /* We know what class REx starts with.  Try to find this position... */
1290 /* if reginfo is NULL, its a dryrun */
1291 /* annoyingly all the vars in this routine have different names from their counterparts
1292    in regmatch. /grrr */
1293
1294 STATIC char *
1295 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1296     const char *strend, regmatch_info *reginfo)
1297 {
1298         dVAR;
1299         const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1300         char *m;
1301         STRLEN ln;
1302         STRLEN lnc;
1303         register STRLEN uskip;
1304         unsigned int c1;
1305         unsigned int c2;
1306         char *e;
1307         register I32 tmp = 1;   /* Scratch variable? */
1308         register const bool do_utf8 = PL_reg_match_utf8;
1309         RXi_GET_DECL(prog,progi);
1310
1311         PERL_ARGS_ASSERT_FIND_BYCLASS;
1312
1313         /* We know what class it must start with. */
1314         switch (OP(c)) {
1315         case ANYOF:
1316             if (do_utf8) {
1317                  REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1318                           !UTF8_IS_INVARIANT((U8)s[0]) ?
1319                           reginclass(prog, c, (U8*)s, 0, do_utf8) :
1320                           REGINCLASS(prog, c, (U8*)s));
1321             }
1322             else {
1323                  while (s < strend) {
1324                       STRLEN skip = 1;
1325
1326                       if (REGINCLASS(prog, c, (U8*)s) ||
1327                           (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1328                            /* The assignment of 2 is intentional:
1329                             * for the folded sharp s, the skip is 2. */
1330                            (skip = SHARP_S_SKIP))) {
1331                            if (tmp && (!reginfo || regtry(reginfo, &s)))
1332                                 goto got_it;
1333                            else
1334                                 tmp = doevery;
1335                       }
1336                       else
1337                            tmp = 1;
1338                       s += skip;
1339                  }
1340             }
1341             break;
1342         case CANY:
1343             REXEC_FBC_SCAN(
1344                 if (tmp && (!reginfo || regtry(reginfo, &s)))
1345                     goto got_it;
1346                 else
1347                     tmp = doevery;
1348             );
1349             break;
1350         case EXACTF:
1351             m   = STRING(c);
1352             ln  = STR_LEN(c);   /* length to match in octets/bytes */
1353             lnc = (I32) ln;     /* length to match in characters */
1354             if (UTF) {
1355                 STRLEN ulen1, ulen2;
1356                 U8 *sm = (U8 *) m;
1357                 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1358                 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1359                 /* used by commented-out code below */
1360                 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1361
1362                 /* XXX: Since the node will be case folded at compile
1363                    time this logic is a little odd, although im not
1364                    sure that its actually wrong. --dmq */
1365
1366                 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1367                 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1368
1369                 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1370                    codepoint of the first character in the converted
1371                    form, yet originally we did the extra step.
1372                    No tests fail by commenting this code out however
1373                    so Ive left it out. -- dmq.
1374
1375                 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1376                                     0, uniflags);
1377                 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1378                                     0, uniflags);
1379                 */
1380
1381                 lnc = 0;
1382                 while (sm < ((U8 *) m + ln)) {
1383                     lnc++;
1384                     sm += UTF8SKIP(sm);
1385                 }
1386             }
1387             else {
1388                 c1 = *(U8*)m;
1389                 c2 = PL_fold[c1];
1390             }
1391             goto do_exactf;
1392         case EXACTFL:
1393             m   = STRING(c);
1394             ln  = STR_LEN(c);
1395             lnc = (I32) ln;
1396             c1 = *(U8*)m;
1397             c2 = PL_fold_locale[c1];
1398           do_exactf:
1399             e = HOP3c(strend, -((I32)lnc), s);
1400
1401             if (!reginfo && e < s)
1402                 e = s;                  /* Due to minlen logic of intuit() */
1403
1404             /* The idea in the EXACTF* cases is to first find the
1405              * first character of the EXACTF* node and then, if
1406              * necessary, case-insensitively compare the full
1407              * text of the node.  The c1 and c2 are the first
1408              * characters (though in Unicode it gets a bit
1409              * more complicated because there are more cases
1410              * than just upper and lower: one needs to use
1411              * the so-called folding case for case-insensitive
1412              * matching (called "loose matching" in Unicode).
1413              * ibcmp_utf8() will do just that. */
1414
1415             if (do_utf8 || UTF) {
1416                 UV c, f;
1417                 U8 tmpbuf [UTF8_MAXBYTES+1];
1418                 STRLEN len = 1;
1419                 STRLEN foldlen;
1420                 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1421                 if (c1 == c2) {
1422                     /* Upper and lower of 1st char are equal -
1423                      * probably not a "letter". */
1424                     while (s <= e) {
1425                         if (do_utf8) {
1426                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1427                                            uniflags);
1428                         } else {
1429                             c = *((U8*)s);
1430                         }
1431                         REXEC_FBC_EXACTISH_CHECK(c == c1);
1432                     }
1433                 }
1434                 else {
1435                     while (s <= e) {
1436                         if (do_utf8) {
1437                             c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1438                                            uniflags);
1439                         } else {
1440                             c = *((U8*)s);
1441                         }
1442
1443                         /* Handle some of the three Greek sigmas cases.
1444                          * Note that not all the possible combinations
1445                          * are handled here: some of them are handled
1446                          * by the standard folding rules, and some of
1447                          * them (the character class or ANYOF cases)
1448                          * are handled during compiletime in
1449                          * regexec.c:S_regclass(). */
1450                         if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1451                             c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1452                             c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1453
1454                         REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1455                     }
1456                 }
1457             }
1458             else {
1459                 /* Neither pattern nor string are UTF8 */
1460                 if (c1 == c2)
1461                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1462                 else
1463                     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1464             }
1465             break;
1466         case BOUNDL:
1467             PL_reg_flags |= RF_tainted;
1468             /* FALL THROUGH */
1469         case BOUND:
1470             if (do_utf8) {
1471                 if (s == PL_bostr)
1472                     tmp = '\n';
1473                 else {
1474                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1475                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1476                 }
1477                 tmp = ((OP(c) == BOUND ?
1478                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1479                 LOAD_UTF8_CHARCLASS_ALNUM();
1480                 REXEC_FBC_UTF8_SCAN(
1481                     if (tmp == !(OP(c) == BOUND ?
1482                                  (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1483                                  isALNUM_LC_utf8((U8*)s)))
1484                     {
1485                         tmp = !tmp;
1486                         REXEC_FBC_TRYIT;
1487                 }
1488                 );
1489             }
1490             else {
1491                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1492                 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1493                 REXEC_FBC_SCAN(
1494                     if (tmp ==
1495                         !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1496                         tmp = !tmp;
1497                         REXEC_FBC_TRYIT;
1498                 }
1499                 );
1500             }
1501             if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1502                 goto got_it;
1503             break;
1504         case NBOUNDL:
1505             PL_reg_flags |= RF_tainted;
1506             /* FALL THROUGH */
1507         case NBOUND:
1508             if (do_utf8) {
1509                 if (s == PL_bostr)
1510                     tmp = '\n';
1511                 else {
1512                     U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1513                     tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1514                 }
1515                 tmp = ((OP(c) == NBOUND ?
1516                         isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1517                 LOAD_UTF8_CHARCLASS_ALNUM();
1518                 REXEC_FBC_UTF8_SCAN(
1519                     if (tmp == !(OP(c) == NBOUND ?
1520                                  (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1521                                  isALNUM_LC_utf8((U8*)s)))
1522                         tmp = !tmp;
1523                     else REXEC_FBC_TRYIT;
1524                 );
1525             }
1526             else {
1527                 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1528                 tmp = ((OP(c) == NBOUND ?
1529                         isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1530                 REXEC_FBC_SCAN(
1531                     if (tmp ==
1532                         !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1533                         tmp = !tmp;
1534                     else REXEC_FBC_TRYIT;
1535                 );
1536             }
1537             if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1538                 goto got_it;
1539             break;
1540         case ALNUM:
1541             REXEC_FBC_CSCAN_PRELOAD(
1542                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1543                 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1544                 isALNUM(*s)
1545             );
1546         case ALNUML:
1547             REXEC_FBC_CSCAN_TAINT(
1548                 isALNUM_LC_utf8((U8*)s),
1549                 isALNUM_LC(*s)
1550             );
1551         case NALNUM:
1552             REXEC_FBC_CSCAN_PRELOAD(
1553                 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1554                 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1555                 !isALNUM(*s)
1556             );
1557         case NALNUML:
1558             REXEC_FBC_CSCAN_TAINT(
1559                 !isALNUM_LC_utf8((U8*)s),
1560                 !isALNUM_LC(*s)
1561             );
1562         case SPACE:
1563             REXEC_FBC_CSCAN_PRELOAD(
1564                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1565                 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
1566                 isSPACE(*s)
1567             );
1568         case SPACEL:
1569             REXEC_FBC_CSCAN_TAINT(
1570                 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1571                 isSPACE_LC(*s)
1572             );
1573         case NSPACE:
1574             REXEC_FBC_CSCAN_PRELOAD(
1575                 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1576                 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
1577                 !isSPACE(*s)
1578             );
1579         case NSPACEL:
1580             REXEC_FBC_CSCAN_TAINT(
1581                 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1582                 !isSPACE_LC(*s)
1583             );
1584         case DIGIT:
1585             REXEC_FBC_CSCAN_PRELOAD(
1586                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1587                 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1588                 isDIGIT(*s)
1589             );
1590         case DIGITL:
1591             REXEC_FBC_CSCAN_TAINT(
1592                 isDIGIT_LC_utf8((U8*)s),
1593                 isDIGIT_LC(*s)
1594             );
1595         case NDIGIT:
1596             REXEC_FBC_CSCAN_PRELOAD(
1597                 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1598                 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1599                 !isDIGIT(*s)
1600             );
1601         case NDIGITL:
1602             REXEC_FBC_CSCAN_TAINT(
1603                 !isDIGIT_LC_utf8((U8*)s),
1604                 !isDIGIT_LC(*s)
1605             );
1606         case LNBREAK:
1607             REXEC_FBC_CSCAN(
1608                 is_LNBREAK_utf8(s),
1609                 is_LNBREAK_latin1(s)
1610             );
1611         case VERTWS:
1612             REXEC_FBC_CSCAN(
1613                 is_VERTWS_utf8(s),
1614                 is_VERTWS_latin1(s)
1615             );
1616         case NVERTWS:
1617             REXEC_FBC_CSCAN(
1618                 !is_VERTWS_utf8(s),
1619                 !is_VERTWS_latin1(s)
1620             );
1621         case HORIZWS:
1622             REXEC_FBC_CSCAN(
1623                 is_HORIZWS_utf8(s),
1624                 is_HORIZWS_latin1(s)
1625             );
1626         case NHORIZWS:
1627             REXEC_FBC_CSCAN(
1628                 !is_HORIZWS_utf8(s),
1629                 !is_HORIZWS_latin1(s)
1630             );
1631         case AHOCORASICKC:
1632         case AHOCORASICK:
1633             {
1634                 DECL_TRIE_TYPE(c);
1635                 /* what trie are we using right now */
1636                 reg_ac_data *aho
1637                     = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1638                 reg_trie_data *trie
1639                     = (reg_trie_data*)progi->data->data[ aho->trie ];
1640                 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1641
1642                 const char *last_start = strend - trie->minlen;
1643 #ifdef DEBUGGING
1644                 const char *real_start = s;
1645 #endif
1646                 STRLEN maxlen = trie->maxlen;
1647                 SV *sv_points;
1648                 U8 **points; /* map of where we were in the input string
1649                                 when reading a given char. For ASCII this
1650                                 is unnecessary overhead as the relationship
1651                                 is always 1:1, but for Unicode, especially
1652                                 case folded Unicode this is not true. */
1653                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1654                 U8 *bitmap=NULL;
1655
1656
1657                 GET_RE_DEBUG_FLAGS_DECL;
1658
1659                 /* We can't just allocate points here. We need to wrap it in
1660                  * an SV so it gets freed properly if there is a croak while
1661                  * running the match */
1662                 ENTER;
1663                 SAVETMPS;
1664                 sv_points=newSV(maxlen * sizeof(U8 *));
1665                 SvCUR_set(sv_points,
1666                     maxlen * sizeof(U8 *));
1667                 SvPOK_on(sv_points);
1668                 sv_2mortal(sv_points);
1669                 points=(U8**)SvPV_nolen(sv_points );
1670                 if ( trie_type != trie_utf8_fold
1671                      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1672                 {
1673                     if (trie->bitmap)
1674                         bitmap=(U8*)trie->bitmap;
1675                     else
1676                         bitmap=(U8*)ANYOF_BITMAP(c);
1677                 }
1678                 /* this is the Aho-Corasick algorithm modified a touch
1679                    to include special handling for long "unknown char"
1680                    sequences. The basic idea being that we use AC as long
1681                    as we are dealing with a possible matching char, when
1682                    we encounter an unknown char (and we have not encountered
1683                    an accepting state) we scan forward until we find a legal
1684                    starting char.
1685                    AC matching is basically that of trie matching, except
1686                    that when we encounter a failing transition, we fall back
1687                    to the current states "fail state", and try the current char
1688                    again, a process we repeat until we reach the root state,
1689                    state 1, or a legal transition. If we fail on the root state
1690                    then we can either terminate if we have reached an accepting
1691                    state previously, or restart the entire process from the beginning
1692                    if we have not.
1693
1694                  */
1695                 while (s <= last_start) {
1696                     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1697                     U8 *uc = (U8*)s;
1698                     U16 charid = 0;
1699                     U32 base = 1;
1700                     U32 state = 1;
1701                     UV uvc = 0;
1702                     STRLEN len = 0;
1703                     STRLEN foldlen = 0;
1704                     U8 *uscan = (U8*)NULL;
1705                     U8 *leftmost = NULL;
1706 #ifdef DEBUGGING
1707                     U32 accepted_word= 0;
1708 #endif
1709                     U32 pointpos = 0;
1710
1711                     while ( state && uc <= (U8*)strend ) {
1712                         int failed=0;
1713                         U32 word = aho->states[ state ].wordnum;
1714
1715                         if( state==1 ) {
1716                             if ( bitmap ) {
1717                                 DEBUG_TRIE_EXECUTE_r(
1718                                     if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1719                                         dump_exec_pos( (char *)uc, c, strend, real_start,
1720                                             (char *)uc, do_utf8 );
1721                                         PerlIO_printf( Perl_debug_log,
1722                                             " Scanning for legal start char...\n");
1723                                     }
1724                                 );
1725                                 while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1726                                     uc++;
1727                                 }
1728                                 s= (char *)uc;
1729                             }
1730                             if (uc >(U8*)last_start) break;
1731                         }
1732
1733                         if ( word ) {
1734                             U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1735                             if (!leftmost || lpos < leftmost) {
1736                                 DEBUG_r(accepted_word=word);
1737                                 leftmost= lpos;
1738                             }
1739                             if (base==0) break;
1740
1741                         }
1742                         points[pointpos++ % maxlen]= uc;
1743                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1744                                              uscan, len, uvc, charid, foldlen,
1745                                              foldbuf, uniflags);
1746                         DEBUG_TRIE_EXECUTE_r({
1747                             dump_exec_pos( (char *)uc, c, strend, real_start,
1748                                 s,   do_utf8 );
1749                             PerlIO_printf(Perl_debug_log,
1750                                 " Charid:%3u CP:%4"UVxf" ",
1751                                  charid, uvc);
1752                         });
1753
1754                         do {
1755 #ifdef DEBUGGING
1756                             word = aho->states[ state ].wordnum;
1757 #endif
1758                             base = aho->states[ state ].trans.base;
1759
1760                             DEBUG_TRIE_EXECUTE_r({
1761                                 if (failed)
1762                                     dump_exec_pos( (char *)uc, c, strend, real_start,
1763                                         s,   do_utf8 );
1764                                 PerlIO_printf( Perl_debug_log,
1765                                     "%sState: %4"UVxf", word=%"UVxf,
1766                                     failed ? " Fail transition to " : "",
1767                                     (UV)state, (UV)word);
1768                             });
1769                             if ( base ) {
1770                                 U32 tmp;
1771                                 if (charid &&
1772                                      (base + charid > trie->uniquecharcount )
1773                                      && (base + charid - 1 - trie->uniquecharcount
1774                                             < trie->lasttrans)
1775                                      && trie->trans[base + charid - 1 -
1776                                             trie->uniquecharcount].check == state
1777                                      && (tmp=trie->trans[base + charid - 1 -
1778                                         trie->uniquecharcount ].next))
1779                                 {
1780                                     DEBUG_TRIE_EXECUTE_r(
1781                                         PerlIO_printf( Perl_debug_log," - legal\n"));
1782                                     state = tmp;
1783                                     break;
1784                                 }
1785                                 else {
1786                                     DEBUG_TRIE_EXECUTE_r(
1787                                         PerlIO_printf( Perl_debug_log," - fail\n"));
1788                                     failed = 1;
1789                                     state = aho->fail[state];
1790                                 }
1791                             }
1792                             else {
1793                                 /* we must be accepting here */
1794                                 DEBUG_TRIE_EXECUTE_r(
1795                                         PerlIO_printf( Perl_debug_log," - accepting\n"));
1796                                 failed = 1;
1797                                 break;
1798                             }
1799                         } while(state);
1800                         uc += len;
1801                         if (failed) {
1802                             if (leftmost)
1803                                 break;
1804                             if (!state) state = 1;
1805                         }
1806                     }
1807                     if ( aho->states[ state ].wordnum ) {
1808                         U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1809                         if (!leftmost || lpos < leftmost) {
1810                             DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1811                             leftmost = lpos;
1812                         }
1813                     }
1814                     if (leftmost) {
1815                         s = (char*)leftmost;
1816                         DEBUG_TRIE_EXECUTE_r({
1817                             PerlIO_printf(
1818                                 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1819                                 (UV)accepted_word, (IV)(s - real_start)
1820                             );
1821                         });
1822                         if (!reginfo || regtry(reginfo, &s)) {
1823                             FREETMPS;
1824                             LEAVE;
1825                             goto got_it;
1826                         }
1827                         s = HOPc(s,1);
1828                         DEBUG_TRIE_EXECUTE_r({
1829                             PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1830                         });
1831                     } else {
1832                         DEBUG_TRIE_EXECUTE_r(
1833                             PerlIO_printf( Perl_debug_log,"No match.\n"));
1834                         break;
1835                     }
1836                 }
1837                 FREETMPS;
1838                 LEAVE;
1839             }
1840             break;
1841         default:
1842             Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1843             break;
1844         }
1845         return 0;
1846       got_it:
1847         return s;
1848 }
1849
1850
1851 /*
1852  - regexec_flags - match a regexp against a string
1853  */
1854 I32
1855 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1856               char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1857 /* strend: pointer to null at end of string */
1858 /* strbeg: real beginning of string */
1859 /* minend: end of match must be >=minend after stringarg. */
1860 /* data: May be used for some additional optimizations.
1861          Currently its only used, with a U32 cast, for transmitting
1862          the ganch offset when doing a /g match. This will change */
1863 /* nosave: For optimizations. */
1864 {
1865     dVAR;
1866     struct regexp *const prog = (struct regexp *)SvANY(rx);
1867     /*register*/ char *s;
1868     register regnode *c;
1869     /*register*/ char *startpos = stringarg;
1870     I32 minlen;         /* must match at least this many chars */
1871     I32 dontbother = 0; /* how many characters not to try at end */
1872     I32 end_shift = 0;                  /* Same for the end. */         /* CC */
1873     I32 scream_pos = -1;                /* Internal iterator of scream. */
1874     char *scream_olds = NULL;
1875     const bool do_utf8 = (bool)DO_UTF8(sv);
1876     I32 multiline;
1877     RXi_GET_DECL(prog,progi);
1878     regmatch_info reginfo;  /* create some info to pass to regtry etc */
1879     regexp_paren_pair *swap = NULL;
1880     GET_RE_DEBUG_FLAGS_DECL;
1881
1882     PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1883     PERL_UNUSED_ARG(data);
1884
1885     /* Be paranoid... */
1886     if (prog == NULL || startpos == NULL) {
1887         Perl_croak(aTHX_ "NULL regexp parameter");
1888         return 0;
1889     }
1890
1891     multiline = prog->extflags & RXf_PMf_MULTILINE;
1892     reginfo.prog = rx;   /* Yes, sorry that this is confusing.  */
1893
1894     RX_MATCH_UTF8_set(rx, do_utf8);
1895     DEBUG_EXECUTE_r(
1896         debug_start_match(rx, do_utf8, startpos, strend,
1897         "Matching");
1898     );
1899
1900     minlen = prog->minlen;
1901
1902     if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1903         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1904                               "String too short [regexec_flags]...\n"));
1905         goto phooey;
1906     }
1907
1908
1909     /* Check validity of program. */
1910     if (UCHARAT(progi->program) != REG_MAGIC) {
1911         Perl_croak(aTHX_ "corrupted regexp program");
1912     }
1913
1914     PL_reg_flags = 0;
1915     PL_reg_eval_set = 0;
1916     PL_reg_maxiter = 0;
1917
1918     if (RX_UTF8(rx))
1919         PL_reg_flags |= RF_utf8;
1920
1921     /* Mark beginning of line for ^ and lookbehind. */
1922     reginfo.bol = startpos; /* XXX not used ??? */
1923     PL_bostr  = strbeg;
1924     reginfo.sv = sv;
1925
1926     /* Mark end of line for $ (and such) */
1927     PL_regeol = strend;
1928
1929     /* see how far we have to get to not match where we matched before */
1930     reginfo.till = startpos+minend;
1931
1932     /* If there is a "must appear" string, look for it. */
1933     s = startpos;
1934
1935     if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1936         MAGIC *mg;
1937         if (flags & REXEC_IGNOREPOS){   /* Means: check only at start */
1938             reginfo.ganch = startpos + prog->gofs;
1939             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1940               "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1941         } else if (sv && SvTYPE(sv) >= SVt_PVMG
1942                   && SvMAGIC(sv)
1943                   && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1944                   && mg->mg_len >= 0) {
1945             reginfo.ganch = strbeg + mg->mg_len;        /* Defined pos() */
1946             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1947                 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
1948
1949             if (prog->extflags & RXf_ANCH_GPOS) {
1950                 if (s > reginfo.ganch)
1951                     goto phooey;
1952                 s = reginfo.ganch - prog->gofs;
1953                 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1954                      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
1955                 if (s < strbeg)
1956                     goto phooey;
1957             }
1958         }
1959         else if (data) {
1960             reginfo.ganch = strbeg + PTR2UV(data);
1961             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1962                  "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1963
1964         } else {                                /* pos() not defined */
1965             reginfo.ganch = strbeg;
1966             DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1967                  "GPOS: reginfo.ganch = strbeg\n"));
1968         }
1969     }
1970     if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1971         /* We have to be careful. If the previous successful match
1972            was from this regex we don't want a subsequent partially
1973            successful match to clobber the old results.
1974            So when we detect this possibility we add a swap buffer
1975            to the re, and switch the buffer each match. If we fail
1976            we switch it back, otherwise we leave it swapped.
1977         */
1978         swap = prog->offs;
1979         /* do we need a save destructor here for eval dies? */
1980         Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
1981     }
1982     if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1983         re_scream_pos_data d;
1984
1985         d.scream_olds = &scream_olds;
1986         d.scream_pos = &scream_pos;
1987         s = re_intuit_start(rx, sv, s, strend, flags, &d);
1988         if (!s) {
1989             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1990             goto phooey;        /* not present */
1991         }
1992     }
1993
1994
1995
1996     /* Simplest case:  anchored match need be tried only once. */
1997     /*  [unless only anchor is BOL and multiline is set] */
1998     if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1999         if (s == startpos && regtry(&reginfo, &startpos))
2000             goto got_it;
2001         else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2002                  || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2003         {
2004             char *end;
2005
2006             if (minlen)
2007                 dontbother = minlen - 1;
2008             end = HOP3c(strend, -dontbother, strbeg) - 1;
2009             /* for multiline we only have to try after newlines */
2010             if (prog->check_substr || prog->check_utf8) {
2011                 if (s == startpos)
2012                     goto after_try;
2013                 while (1) {
2014                     if (regtry(&reginfo, &s))
2015                         goto got_it;
2016                   after_try:
2017                     if (s > end)
2018                         goto phooey;
2019                     if (prog->extflags & RXf_USE_INTUIT) {
2020                         s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2021                         if (!s)
2022                             goto phooey;
2023                     }
2024                     else
2025                         s++;
2026                 }
2027             } else {
2028                 if (s > startpos)
2029                     s--;
2030                 while (s < end) {
2031                     if (*s++ == '\n') { /* don't need PL_utf8skip here */
2032                         if (regtry(&reginfo, &s))
2033                             goto got_it;
2034                     }
2035                 }
2036             }
2037         }
2038         goto phooey;
2039     } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2040     {
2041         /* the warning about reginfo.ganch being used without intialization
2042            is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2043            and we only enter this block when the same bit is set. */
2044         char *tmp_s = reginfo.ganch - prog->gofs;
2045
2046         if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2047             goto got_it;
2048         goto phooey;
2049     }
2050
2051     /* Messy cases:  unanchored match. */
2052     if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2053         /* we have /x+whatever/ */
2054         /* it must be a one character string (XXXX Except UTF?) */
2055         char ch;
2056 #ifdef DEBUGGING
2057         int did_match = 0;
2058 #endif
2059         if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2060             do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2061         ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
2062
2063         if (do_utf8) {
2064             REXEC_FBC_SCAN(
2065                 if (*s == ch) {
2066                     DEBUG_EXECUTE_r( did_match = 1 );
2067                     if (regtry(&reginfo, &s)) goto got_it;
2068                     s += UTF8SKIP(s);
2069                     while (s < strend && *s == ch)
2070                         s += UTF8SKIP(s);
2071                 }
2072             );
2073         }
2074         else {
2075             REXEC_FBC_SCAN(
2076                 if (*s == ch) {
2077                     DEBUG_EXECUTE_r( did_match = 1 );
2078                     if (regtry(&reginfo, &s)) goto got_it;
2079                     s++;
2080                     while (s < strend && *s == ch)
2081                         s++;
2082                 }
2083             );
2084         }
2085         DEBUG_EXECUTE_r(if (!did_match)
2086                 PerlIO_printf(Perl_debug_log,
2087                                   "Did not find anchored character...\n")
2088                );
2089     }
2090     else if (prog->anchored_substr != NULL
2091               || prog->anchored_utf8 != NULL
2092               || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2093                   && prog->float_max_offset < strend - s)) {
2094         SV *must;
2095         I32 back_max;
2096         I32 back_min;
2097         char *last;
2098         char *last1;            /* Last position checked before */
2099 #ifdef DEBUGGING
2100         int did_match = 0;
2101 #endif
2102         if (prog->anchored_substr || prog->anchored_utf8) {
2103             if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2104                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2105             must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2106             back_max = back_min = prog->anchored_offset;
2107         } else {
2108             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2109                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2110             must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2111             back_max = prog->float_max_offset;
2112             back_min = prog->float_min_offset;
2113         }
2114
2115
2116         if (must == &PL_sv_undef)
2117             /* could not downgrade utf8 check substring, so must fail */
2118             goto phooey;
2119
2120         if (back_min<0) {
2121             last = strend;
2122         } else {
2123             last = HOP3c(strend,        /* Cannot start after this */
2124                   -(I32)(CHR_SVLEN(must)
2125                          - (SvTAIL(must) != 0) + back_min), strbeg);
2126         }
2127         if (s > PL_bostr)
2128             last1 = HOPc(s, -1);
2129         else
2130             last1 = s - 1;      /* bogus */
2131
2132         /* XXXX check_substr already used to find "s", can optimize if
2133            check_substr==must. */
2134         scream_pos = -1;
2135         dontbother = end_shift;
2136         strend = HOPc(strend, -dontbother);
2137         while ( (s <= last) &&
2138                 ((flags & REXEC_SCREAM)
2139                  ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2140                                     end_shift, &scream_pos, 0))
2141                  : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2142                                   (unsigned char*)strend, must,
2143                                   multiline ? FBMrf_MULTILINE : 0))) ) {
2144             /* we may be pointing at the wrong string */
2145             if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2146                 s = strbeg + (s - SvPVX_const(sv));
2147             DEBUG_EXECUTE_r( did_match = 1 );
2148             if (HOPc(s, -back_max) > last1) {
2149                 last1 = HOPc(s, -back_min);
2150                 s = HOPc(s, -back_max);
2151             }
2152             else {
2153                 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2154
2155                 last1 = HOPc(s, -back_min);
2156                 s = t;
2157             }
2158             if (do_utf8) {
2159                 while (s <= last1) {
2160                     if (regtry(&reginfo, &s))
2161                         goto got_it;
2162                     s += UTF8SKIP(s);
2163                 }
2164             }
2165             else {
2166                 while (s <= last1) {
2167                     if (regtry(&reginfo, &s))
2168                         goto got_it;
2169                     s++;
2170                 }
2171             }
2172         }
2173         DEBUG_EXECUTE_r(if (!did_match) {
2174             RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2175                 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2176             PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2177                               ((must == prog->anchored_substr || must == prog->anchored_utf8)
2178                                ? "anchored" : "floating"),
2179                 quoted, RE_SV_TAIL(must));
2180         });
2181         goto phooey;
2182     }
2183     else if ( (c = progi->regstclass) ) {
2184         if (minlen) {
2185             const OPCODE op = OP(progi->regstclass);
2186             /* don't bother with what can't match */
2187             if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2188                 strend = HOPc(strend, -(minlen - 1));
2189         }
2190         DEBUG_EXECUTE_r({
2191             SV * const prop = sv_newmortal();
2192             regprop(prog, prop, c);
2193             {
2194                 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2195                     s,strend-s,60);
2196                 PerlIO_printf(Perl_debug_log,
2197                     "Matching stclass %.*s against %s (%d chars)\n",
2198                     (int)SvCUR(prop), SvPVX_const(prop),
2199                      quoted, (int)(strend - s));
2200             }
2201         });
2202         if (find_byclass(prog, c, s, strend, &reginfo))
2203             goto got_it;
2204         DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2205     }
2206     else {
2207         dontbother = 0;
2208         if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2209             /* Trim the end. */
2210             char *last;
2211             SV* float_real;
2212
2213             if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2214                 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2215             float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2216
2217             if (flags & REXEC_SCREAM) {
2218                 last = screaminstr(sv, float_real, s - strbeg,
2219                                    end_shift, &scream_pos, 1); /* last one */
2220                 if (!last)
2221                     last = scream_olds; /* Only one occurrence. */
2222                 /* we may be pointing at the wrong string */
2223                 else if (RXp_MATCH_COPIED(prog))
2224                     s = strbeg + (s - SvPVX_const(sv));
2225             }
2226             else {
2227                 STRLEN len;
2228                 const char * const little = SvPV_const(float_real, len);
2229
2230                 if (SvTAIL(float_real)) {
2231                     if (memEQ(strend - len + 1, little, len - 1))
2232                         last = strend - len + 1;
2233                     else if (!multiline)
2234                         last = memEQ(strend - len, little, len)
2235                             ? strend - len : NULL;
2236                     else
2237                         goto find_last;
2238                 } else {
2239                   find_last:
2240                     if (len)
2241                         last = rninstr(s, strend, little, little + len);
2242                     else
2243                         last = strend;  /* matching "$" */
2244                 }
2245             }
2246             if (last == NULL) {
2247                 DEBUG_EXECUTE_r(
2248                     PerlIO_printf(Perl_debug_log,
2249                         "%sCan't trim the tail, match fails (should not happen)%s\n",
2250                         PL_colors[4], PL_colors[5]));
2251                 goto phooey; /* Should not happen! */
2252             }
2253             dontbother = strend - last + prog->float_min_offset;
2254         }
2255         if (minlen && (dontbother < minlen))
2256             dontbother = minlen - 1;
2257         strend -= dontbother;              /* this one's always in bytes! */
2258         /* We don't know much -- general case. */
2259         if (do_utf8) {
2260             for (;;) {
2261                 if (regtry(&reginfo, &s))
2262                     goto got_it;
2263                 if (s >= strend)
2264                     break;
2265                 s += UTF8SKIP(s);
2266             };
2267         }
2268         else {
2269             do {
2270                 if (regtry(&reginfo, &s))
2271                     goto got_it;
2272             } while (s++ < strend);
2273         }
2274     }
2275
2276     /* Failure. */
2277     goto phooey;
2278
2279 got_it:
2280     Safefree(swap);
2281     RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2282
2283     if (PL_reg_eval_set)
2284         restore_pos(aTHX_ prog);
2285     if (RXp_PAREN_NAMES(prog))
2286         (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2287
2288     /* make sure $`, $&, $', and $digit will work later */
2289     if ( !(flags & REXEC_NOT_FIRST) ) {
2290         RX_MATCH_COPY_FREE(rx);
2291         if (flags & REXEC_COPY_STR) {
2292             const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2293 #ifdef PERL_OLD_COPY_ON_WRITE
2294             if ((SvIsCOW(sv)
2295                  || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2296                 if (DEBUG_C_TEST) {
2297                     PerlIO_printf(Perl_debug_log,
2298                                   "Copy on write: regexp capture, type %d\n",
2299                                   (int) SvTYPE(sv));
2300                 }
2301                 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2302                 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2303                 assert (SvPOKp(prog->saved_copy));
2304             } else
2305 #endif
2306             {
2307                 RX_MATCH_COPIED_on(rx);
2308                 s = savepvn(strbeg, i);
2309                 prog->subbeg = s;
2310             }
2311             prog->sublen = i;
2312         }
2313         else {
2314             prog->subbeg = strbeg;
2315             prog->sublen = PL_regeol - strbeg;  /* strend may have been modified */
2316         }
2317     }
2318
2319     return 1;
2320
2321 phooey:
2322     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2323                           PL_colors[4], PL_colors[5]));
2324     if (PL_reg_eval_set)
2325         restore_pos(aTHX_ prog);
2326     if (swap) {
2327         /* we failed :-( roll it back */
2328         Safefree(prog->offs);
2329         prog->offs = swap;
2330     }
2331
2332     return 0;
2333 }
2334
2335
2336 /*
2337  - regtry - try match at specific point
2338  */
2339 STATIC I32                      /* 0 failure, 1 success */
2340 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2341 {
2342     dVAR;
2343     CHECKPOINT lastcp;
2344     REGEXP *const rx = reginfo->prog;
2345     regexp *const prog = (struct regexp *)SvANY(rx);
2346     RXi_GET_DECL(prog,progi);
2347     GET_RE_DEBUG_FLAGS_DECL;
2348
2349     PERL_ARGS_ASSERT_REGTRY;
2350
2351     reginfo->cutpoint=NULL;
2352
2353     if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2354         MAGIC *mg;
2355
2356         PL_reg_eval_set = RS_init;
2357         DEBUG_EXECUTE_r(DEBUG_s(
2358             PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2359                           (IV)(PL_stack_sp - PL_stack_base));
2360             ));
2361         SAVESTACK_CXPOS();
2362         cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2363         /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2364         SAVETMPS;
2365         /* Apparently this is not needed, judging by wantarray. */
2366         /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2367            cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2368
2369         if (reginfo->sv) {
2370             /* Make $_ available to executed code. */
2371             if (reginfo->sv != DEFSV) {
2372                 SAVE_DEFSV;
2373                 DEFSV_set(reginfo->sv);
2374             }
2375
2376             if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2377                   && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2378                 /* prepare for quick setting of pos */
2379 #ifdef PERL_OLD_COPY_ON_WRITE
2380                 if (SvIsCOW(reginfo->sv))
2381                     sv_force_normal_flags(reginfo->sv, 0);
2382 #endif
2383                 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2384                                  &PL_vtbl_mglob, NULL, 0);
2385                 mg->mg_len = -1;
2386             }
2387             PL_reg_magic    = mg;
2388             PL_reg_oldpos   = mg->mg_len;
2389             SAVEDESTRUCTOR_X(restore_pos, prog);
2390         }
2391         if (!PL_reg_curpm) {
2392             Newxz(PL_reg_curpm, 1, PMOP);
2393 #ifdef USE_ITHREADS
2394             {
2395                 SV* const repointer = &PL_sv_undef;
2396                 /* this regexp is also owned by the new PL_reg_curpm, which
2397                    will try to free it.  */
2398                 av_push(PL_regex_padav, repointer);
2399                 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2400                 PL_regex_pad = AvARRAY(PL_regex_padav);
2401             }
2402 #endif
2403         }
2404 #ifdef USE_ITHREADS
2405         /* It seems that non-ithreads works both with and without this code.
2406            So for efficiency reasons it seems best not to have the code
2407            compiled when it is not needed.  */
2408         /* This is safe against NULLs: */
2409         ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2410         /* PM_reg_curpm owns a reference to this regexp.  */
2411         ReREFCNT_inc(rx);
2412 #endif
2413         PM_SETRE(PL_reg_curpm, rx);
2414         PL_reg_oldcurpm = PL_curpm;
2415         PL_curpm = PL_reg_curpm;
2416         if (RXp_MATCH_COPIED(prog)) {
2417             /*  Here is a serious problem: we cannot rewrite subbeg,
2418                 since it may be needed if this match fails.  Thus
2419                 $` inside (?{}) could fail... */
2420             PL_reg_oldsaved = prog->subbeg;
2421             PL_reg_oldsavedlen = prog->sublen;
2422 #ifdef PERL_OLD_COPY_ON_WRITE
2423             PL_nrs = prog->saved_copy;
2424 #endif
2425             RXp_MATCH_COPIED_off(prog);
2426         }
2427         else
2428             PL_reg_oldsaved = NULL;
2429         prog->subbeg = PL_bostr;
2430         prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2431     }
2432     DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2433     prog->offs[0].start = *startpos - PL_bostr;
2434     PL_reginput = *startpos;
2435     PL_reglastparen = &prog->lastparen;
2436     PL_reglastcloseparen = &prog->lastcloseparen;
2437     prog->lastparen = 0;
2438     prog->lastcloseparen = 0;
2439     PL_regsize = 0;
2440     PL_regoffs = prog->offs;
2441     if (PL_reg_start_tmpl <= prog->nparens) {
2442         PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2443         if(PL_reg_start_tmp)
2444             Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2445         else
2446             Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2447     }
2448
2449     /* XXXX What this code is doing here?!!!  There should be no need
2450        to do this again and again, PL_reglastparen should take care of
2451        this!  --ilya*/
2452
2453     /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2454      * Actually, the code in regcppop() (which Ilya may be meaning by
2455      * PL_reglastparen), is not needed at all by the test suite
2456      * (op/regexp, op/pat, op/split), but that code is needed otherwise
2457      * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2458      * Meanwhile, this code *is* needed for the
2459      * above-mentioned test suite tests to succeed.  The common theme
2460      * on those tests seems to be returning null fields from matches.
2461      * --jhi updated by dapm */
2462 #if 1
2463     if (prog->nparens) {
2464         regexp_paren_pair *pp = PL_regoffs;
2465         register I32 i;
2466         for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2467             ++pp;
2468             pp->start = -1;
2469             pp->end = -1;
2470         }
2471     }
2472 #endif
2473     REGCP_SET(lastcp);
2474     if (regmatch(reginfo, progi->program + 1)) {
2475         PL_regoffs[0].end = PL_reginput - PL_bostr;
2476         return 1;
2477     }
2478     if (reginfo->cutpoint)
2479         *startpos= reginfo->cutpoint;
2480     REGCP_UNWIND(lastcp);
2481     return 0;
2482 }
2483
2484
2485 #define sayYES goto yes
2486 #define sayNO goto no
2487 #define sayNO_SILENT goto no_silent
2488
2489 /* we dont use STMT_START/END here because it leads to
2490    "unreachable code" warnings, which are bogus, but distracting. */
2491 #define CACHEsayNO \
2492     if (ST.cache_mask) \
2493        PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2494     sayNO
2495
2496 /* this is used to determine how far from the left messages like
2497    'failed...' are printed. It should be set such that messages
2498    are inline with the regop output that created them.
2499 */
2500 #define REPORT_CODE_OFF 32
2501
2502
2503 /* Make sure there is a test for this +1 options in re_tests */
2504 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2505
2506 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2507 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2508
2509 #define SLAB_FIRST(s) (&(s)->states[0])
2510 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2511
2512 /* grab a new slab and return the first slot in it */
2513
2514 STATIC regmatch_state *
2515 S_push_slab(pTHX)
2516 {
2517 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2518     dMY_CXT;
2519 #endif
2520     regmatch_slab *s = PL_regmatch_slab->next;
2521     if (!s) {
2522         Newx(s, 1, regmatch_slab);
2523         s->prev = PL_regmatch_slab;
2524         s->next = NULL;
2525         PL_regmatch_slab->next = s;
2526     }
2527     PL_regmatch_slab = s;
2528     return SLAB_FIRST(s);
2529 }
2530
2531
2532 /* push a new state then goto it */
2533
2534 #define PUSH_STATE_GOTO(state, node) \
2535     scan = node; \
2536     st->resume_state = state; \
2537     goto push_state;
2538
2539 /* push a new state with success backtracking, then goto it */
2540
2541 #define PUSH_YES_STATE_GOTO(state, node) \
2542     scan = node; \
2543     st->resume_state = state; \
2544     goto push_yes_state;
2545
2546
2547
2548 /*
2549
2550 regmatch() - main matching routine
2551
2552 This is basically one big switch statement in a loop. We execute an op,
2553 set 'next' to point the next op, and continue. If we come to a point which
2554 we may need to backtrack to on failure such as (A|B|C), we push a
2555 backtrack state onto the backtrack stack. On failure, we pop the top
2556 state, and re-enter the loop at the state indicated. If there are no more
2557 states to pop, we return failure.
2558
2559 Sometimes we also need to backtrack on success; for example /A+/, where
2560 after successfully matching one A, we need to go back and try to
2561 match another one; similarly for lookahead assertions: if the assertion
2562 completes successfully, we backtrack to the state just before the assertion
2563 and then carry on.  In these cases, the pushed state is marked as
2564 'backtrack on success too'. This marking is in fact done by a chain of
2565 pointers, each pointing to the previous 'yes' state. On success, we pop to
2566 the nearest yes state, discarding any intermediate failure-only states.
2567 Sometimes a yes state is pushed just to force some cleanup code to be
2568 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2569 it to free the inner regex.
2570
2571 Note that failure backtracking rewinds the cursor position, while
2572 success backtracking leaves it alone.
2573
2574 A pattern is complete when the END op is executed, while a subpattern
2575 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2576 ops trigger the "pop to last yes state if any, otherwise return true"
2577 behaviour.
2578
2579 A common convention in this function is to use A and B to refer to the two
2580 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2581 the subpattern to be matched possibly multiple times, while B is the entire
2582 rest of the pattern. Variable and state names reflect this convention.
2583
2584 The states in the main switch are the union of ops and failure/success of
2585 substates associated with with that op.  For example, IFMATCH is the op
2586 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2587 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2588 successfully matched A and IFMATCH_A_fail is a state saying that we have
2589 just failed to match A. Resume states always come in pairs. The backtrack
2590 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2591 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2592 on success or failure.
2593
2594 The struct that holds a backtracking state is actually a big union, with
2595 one variant for each major type of op. The variable st points to the
2596 top-most backtrack struct. To make the code clearer, within each
2597 block of code we #define ST to alias the relevant union.
2598
2599 Here's a concrete example of a (vastly oversimplified) IFMATCH
2600 implementation:
2601
2602     switch (state) {
2603     ....
2604
2605 #define ST st->u.ifmatch
2606
2607     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2608         ST.foo = ...; // some state we wish to save
2609         ...
2610         // push a yes backtrack state with a resume value of
2611         // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2612         // first node of A:
2613         PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2614         // NOTREACHED
2615
2616     case IFMATCH_A: // we have successfully executed A; now continue with B
2617         next = B;
2618         bar = ST.foo; // do something with the preserved value
2619         break;
2620
2621     case IFMATCH_A_fail: // A failed, so the assertion failed
2622         ...;   // do some housekeeping, then ...
2623         sayNO; // propagate the failure
2624
2625 #undef ST
2626
2627     ...
2628     }
2629
2630 For any old-timers reading this who are familiar with the old recursive
2631 approach, the code above is equivalent to:
2632
2633     case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2634     {
2635         int foo = ...
2636         ...
2637         if (regmatch(A)) {
2638             next = B;
2639             bar = foo;
2640             break;
2641         }
2642         ...;   // do some housekeeping, then ...
2643         sayNO; // propagate the failure
2644     }
2645
2646 The topmost backtrack state, pointed to by st, is usually free. If you
2647 want to claim it, populate any ST.foo fields in it with values you wish to
2648 save, then do one of
2649
2650         PUSH_STATE_GOTO(resume_state, node);
2651         PUSH_YES_STATE_GOTO(resume_state, node);
2652
2653 which sets that backtrack state's resume value to 'resume_state', pushes a
2654 new free entry to the top of the backtrack stack, then goes to 'node'.
2655 On backtracking, the free slot is popped, and the saved state becomes the
2656 new free state. An ST.foo field in this new top state can be temporarily
2657 accessed to retrieve values, but once the main loop is re-entered, it
2658 becomes available for reuse.
2659
2660 Note that the depth of the backtrack stack constantly increases during the
2661 left-to-right execution of the pattern, rather than going up and down with
2662 the pattern nesting. For example the stack is at its maximum at Z at the
2663 end of the pattern, rather than at X in the following:
2664
2665     /(((X)+)+)+....(Y)+....Z/
2666
2667 The only exceptions to this are lookahead/behind assertions and the cut,
2668 (?>A), which pop all the backtrack states associated with A before
2669 continuing.
2670
2671 Bascktrack state structs are allocated in slabs of about 4K in size.
2672 PL_regmatch_state and st always point to the currently active state,
2673 and PL_regmatch_slab points to the slab currently containing
2674 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2675 allocated, and is never freed until interpreter destruction. When the slab
2676 is full, a new one is allocated and chained to the end. At exit from
2677 regmatch(), slabs allocated since entry are freed.
2678
2679 */
2680
2681
2682 #define DEBUG_STATE_pp(pp)                                  \
2683     DEBUG_STATE_r({                                         \
2684         DUMP_EXEC_POS(locinput, scan, do_utf8);             \
2685         PerlIO_printf(Perl_debug_log,                       \
2686             "    %*s"pp" %s%s%s%s%s\n",                     \
2687             depth*2, "",                                    \
2688             PL_reg_name[st->resume_state],                     \
2689             ((st==yes_state||st==mark_state) ? "[" : ""),   \
2690             ((st==yes_state) ? "Y" : ""),                   \
2691             ((st==mark_state) ? "M" : ""),                  \
2692             ((st==yes_state||st==mark_state) ? "]" : "")    \
2693         );                                                  \
2694     });
2695
2696
2697 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2698
2699 #ifdef DEBUGGING
2700
2701 STATIC void
2702 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2703     const char *start, const char *end, const char *blurb)
2704 {
2705     const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2706
2707     PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2708
2709     if (!PL_colorset)
2710             reginitcolors();
2711     {
2712         RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2713             RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2714
2715         RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2716             start, end - start, 60);
2717
2718         PerlIO_printf(Perl_debug_log,
2719             "%s%s REx%s %s against %s\n",
2720                        PL_colors[4], blurb, PL_colors[5], s0, s1);
2721
2722         if (do_utf8||utf8_pat)
2723             PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2724                 utf8_pat ? "pattern" : "",
2725                 utf8_pat && do_utf8 ? " and " : "",
2726                 do_utf8 ? "string" : ""
2727             );
2728     }
2729 }
2730
2731 STATIC void
2732 S_dump_exec_pos(pTHX_ const char *locinput,
2733                       const regnode *scan,
2734                       const char *loc_regeol,
2735                       const char *loc_bostr,
2736                       const char *loc_reg_starttry,
2737                       const bool do_utf8)
2738 {
2739     const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2740     const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2741     int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2742     /* The part of the string before starttry has one color
2743        (pref0_len chars), between starttry and current
2744        position another one (pref_len - pref0_len chars),
2745        after the current position the third one.
2746        We assume that pref0_len <= pref_len, otherwise we
2747        decrease pref0_len.  */
2748     int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2749         ? (5 + taill) - l : locinput - loc_bostr;
2750     int pref0_len;
2751
2752     PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2753
2754     while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2755         pref_len++;
2756     pref0_len = pref_len  - (locinput - loc_reg_starttry);
2757     if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2758         l = ( loc_regeol - locinput > (5 + taill) - pref_len
2759               ? (5 + taill) - pref_len : loc_regeol - locinput);
2760     while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2761         l--;
2762     if (pref0_len < 0)
2763         pref0_len = 0;
2764     if (pref0_len > pref_len)
2765         pref0_len = pref_len;
2766     {
2767         const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2768
2769         RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2770             (locinput - pref_len),pref0_len, 60, 4, 5);
2771
2772         RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2773                     (locinput - pref_len + pref0_len),
2774                     pref_len - pref0_len, 60, 2, 3);
2775
2776         RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2777                     locinput, loc_regeol - locinput, 10, 0, 1);
2778
2779         const STRLEN tlen=len0+len1+len2;
2780         PerlIO_printf(Perl_debug_log,
2781                     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2782                     (IV)(locinput - loc_bostr),
2783                     len0, s0,
2784                     len1, s1,
2785                     (docolor ? "" : "> <"),
2786                     len2, s2,
2787                     (int)(tlen > 19 ? 0 :  19 - tlen),
2788                     "");
2789     }
2790 }
2791
2792 #endif
2793
2794 /* reg_check_named_buff_matched()
2795  * Checks to see if a named buffer has matched. The data array of
2796  * buffer numbers corresponding to the buffer is expected to reside
2797  * in the regexp->data->data array in the slot stored in the ARG() of
2798  * node involved. Note that this routine doesn't actually care about the
2799  * name, that information is not preserved from compilation to execution.
2800  * Returns the index of the leftmost defined buffer with the given name
2801  * or 0 if non of the buffers matched.
2802  */
2803 STATIC I32
2804 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2805 {
2806     I32 n;
2807     RXi_GET_DECL(rex,rexi);
2808     SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2809     I32 *nums=(I32*)SvPVX(sv_dat);
2810
2811     PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2812
2813     for ( n=0; n<SvIVX(sv_dat); n++ ) {
2814         if ((I32)*PL_reglastparen >= nums[n] &&
2815             PL_regoffs[nums[n]].end != -1)
2816         {
2817             return nums[n];
2818         }
2819     }
2820     return 0;
2821 }
2822
2823
2824 /* free all slabs above current one  - called during LEAVE_SCOPE */
2825
2826 STATIC void
2827 S_clear_backtrack_stack(pTHX_ void *p)
2828 {
2829     regmatch_slab *s = PL_regmatch_slab->next;
2830     PERL_UNUSED_ARG(p);
2831
2832     if (!s)
2833         return;
2834     PL_regmatch_slab->next = NULL;
2835     while (s) {
2836         regmatch_slab * const osl = s;
2837         s = s->next;
2838         Safefree(osl);
2839     }
2840 }
2841
2842
2843 #define SETREX(Re1,Re2) \
2844     if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2845     Re1 = (Re2)
2846
2847 STATIC I32                      /* 0 failure, 1 success */
2848 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2849 {
2850 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2851     dMY_CXT;
2852 #endif
2853     dVAR;
2854     register const bool do_utf8 = PL_reg_match_utf8;
2855     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2856     REGEXP *rex_sv = reginfo->prog;
2857     regexp *rex = (struct regexp *)SvANY(rex_sv);
2858     RXi_GET_DECL(rex,rexi);
2859     I32 oldsave;
2860     /* the current state. This is a cached copy of PL_regmatch_state */
2861     register regmatch_state *st;
2862     /* cache heavy used fields of st in registers */
2863     register regnode *scan;
2864     register regnode *next;
2865     register U32 n = 0; /* general value; init to avoid compiler warning */
2866     register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2867     register char *locinput = PL_reginput;
2868     register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2869
2870     bool result = 0;        /* return value of S_regmatch */
2871     int depth = 0;          /* depth of backtrack stack */
2872     U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2873     const U32 max_nochange_depth =
2874         (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2875         3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2876     regmatch_state *yes_state = NULL; /* state to pop to on success of
2877                                                             subpattern */
2878     /* mark_state piggy backs on the yes_state logic so that when we unwind
2879        the stack on success we can update the mark_state as we go */
2880     regmatch_state *mark_state = NULL; /* last mark state we have seen */
2881     regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2882     struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2883     U32 state_num;
2884     bool no_final = 0;      /* prevent failure from backtracking? */
2885     bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2886     char *startpoint = PL_reginput;
2887     SV *popmark = NULL;     /* are we looking for a mark? */
2888     SV *sv_commit = NULL;   /* last mark name seen in failure */
2889     SV *sv_yes_mark = NULL; /* last mark name we have seen
2890                                during a successfull match */
2891     U32 lastopen = 0;       /* last open we saw */
2892     bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2893     SV* const oreplsv = GvSV(PL_replgv);
2894     /* these three flags are set by various ops to signal information to
2895      * the very next op. They have a useful lifetime of exactly one loop
2896      * iteration, and are not preserved or restored by state pushes/pops
2897      */
2898     bool sw = 0;            /* the condition value in (?(cond)a|b) */
2899     bool minmod = 0;        /* the next "{n,m}" is a "{n,m}?" */
2900     int logical = 0;        /* the following EVAL is:
2901                                 0: (?{...})
2902                                 1: (?(?{...})X|Y)
2903                                 2: (??{...})
2904                                or the following IFMATCH/UNLESSM is:
2905                                 false: plain (?=foo)
2906                                 true:  used as a condition: (?(?=foo))
2907                             */
2908 #ifdef DEBUGGING
2909     GET_RE_DEBUG_FLAGS_DECL;
2910 #endif
2911
2912     PERL_ARGS_ASSERT_REGMATCH;
2913
2914     DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2915             PerlIO_printf(Perl_debug_log,"regmatch start\n");
2916     }));
2917     /* on first ever call to regmatch, allocate first slab */
2918     if (!PL_regmatch_slab) {
2919         Newx(PL_regmatch_slab, 1, regmatch_slab);
2920         PL_regmatch_slab->prev = NULL;
2921         PL_regmatch_slab->next = NULL;
2922         PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2923     }
2924
2925     oldsave = PL_savestack_ix;
2926     SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2927     SAVEVPTR(PL_regmatch_slab);
2928     SAVEVPTR(PL_regmatch_state);
2929
2930     /* grab next free state slot */
2931     st = ++PL_regmatch_state;
2932     if (st >  SLAB_LAST(PL_regmatch_slab))
2933         st = PL_regmatch_state = S_push_slab(aTHX);
2934
2935     /* Note that nextchr is a byte even in UTF */
2936     nextchr = UCHARAT(locinput);
2937     scan = prog;
2938     while (scan != NULL) {
2939
2940         DEBUG_EXECUTE_r( {
2941             SV * const prop = sv_newmortal();
2942             regnode *rnext=regnext(scan);
2943             DUMP_EXEC_POS( locinput, scan, do_utf8 );
2944             regprop(rex, prop, scan);
2945
2946             PerlIO_printf(Perl_debug_log,
2947                     "%3"IVdf":%*s%s(%"IVdf")\n",
2948                     (IV)(scan - rexi->program), depth*2, "",
2949                     SvPVX_const(prop),
2950                     (PL_regkind[OP(scan)] == END || !rnext) ?
2951                         0 : (IV)(rnext - rexi->program));
2952         });
2953
2954         next = scan + NEXT_OFF(scan);
2955         if (next == scan)
2956             next = NULL;
2957         state_num = OP(scan);
2958
2959       reenter_switch:
2960
2961         assert(PL_reglastparen == &rex->lastparen);
2962         assert(PL_reglastcloseparen == &rex->lastcloseparen);
2963         assert(PL_regoffs == rex->offs);
2964
2965         switch (state_num) {
2966         case BOL:
2967             if (locinput == PL_bostr)
2968             {
2969                 /* reginfo->till = reginfo->bol; */
2970                 break;
2971             }
2972             sayNO;
2973         case MBOL:
2974             if (locinput == PL_bostr ||
2975                 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2976             {
2977                 break;
2978             }
2979             sayNO;
2980         case SBOL:
2981             if (locinput == PL_bostr)
2982                 break;
2983             sayNO;
2984         case GPOS:
2985             if (locinput == reginfo->ganch)
2986                 break;
2987             sayNO;
2988
2989         case KEEPS:
2990             /* update the startpoint */
2991             st->u.keeper.val = PL_regoffs[0].start;
2992             PL_reginput = locinput;
2993             PL_regoffs[0].start = locinput - PL_bostr;
2994             PUSH_STATE_GOTO(KEEPS_next, next);
2995             /*NOT-REACHED*/
2996         case KEEPS_next_fail:
2997             /* rollback the start point change */
2998             PL_regoffs[0].start = st->u.keeper.val;
2999             sayNO_SILENT;
3000             /*NOT-REACHED*/
3001         case EOL:
3002                 goto seol;
3003         case MEOL:
3004             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3005                 sayNO;
3006             break;
3007         case SEOL:
3008           seol:
3009             if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3010                 sayNO;
3011             if (PL_regeol - locinput > 1)
3012                 sayNO;
3013             break;
3014         case EOS:
3015             if (PL_regeol != locinput)
3016                 sayNO;
3017             break;
3018         case SANY:
3019             if (!nextchr && locinput >= PL_regeol)
3020                 sayNO;
3021             if (do_utf8) {
3022                 locinput += PL_utf8skip[nextchr];
3023                 if (locinput > PL_regeol)
3024                     sayNO;
3025                 nextchr = UCHARAT(locinput);
3026             }
3027             else
3028                 nextchr = UCHARAT(++locinput);
3029             break;
3030         case CANY:
3031             if (!nextchr && locinput >= PL_regeol)
3032                 sayNO;
3033             nextchr = UCHARAT(++locinput);
3034             break;
3035         case REG_ANY:
3036             if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3037                 sayNO;
3038             if (do_utf8) {
3039                 locinput += PL_utf8skip[nextchr];
3040                 if (locinput > PL_regeol)
3041                     sayNO;
3042                 nextchr = UCHARAT(locinput);
3043             }
3044             else
3045                 nextchr = UCHARAT(++locinput);
3046             break;
3047
3048 #undef  ST
3049 #define ST st->u.trie
3050         case TRIEC:
3051             /* In this case the charclass data is available inline so
3052                we can fail fast without a lot of extra overhead.
3053              */
3054             if (scan->flags == EXACT || !do_utf8) {
3055                 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3056                     DEBUG_EXECUTE_r(
3057                         PerlIO_printf(Perl_debug_log,
3058                                   "%*s  %sfailed to match trie start class...%s\n",
3059                                   REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3060                     );
3061                     sayNO_SILENT;
3062                     /* NOTREACHED */
3063                 }
3064             }
3065             /* FALL THROUGH */
3066         case TRIE:
3067             {
3068                 /* what type of TRIE am I? (utf8 makes this contextual) */
3069                 DECL_TRIE_TYPE(scan);
3070
3071                 /* what trie are we using right now */
3072                 reg_trie_data * const trie
3073                     = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3074                 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3075                 U32 state = trie->startstate;
3076
3077                 if (trie->bitmap && trie_type != trie_utf8_fold &&
3078                     !TRIE_BITMAP_TEST(trie,*locinput)
3079                 ) {
3080                     if (trie->states[ state ].wordnum) {
3081                          DEBUG_EXECUTE_r(
3082                             PerlIO_printf(Perl_debug_log,
3083                                           "%*s  %smatched empty string...%s\n",
3084                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3085                         );
3086                         break;
3087                     } else {
3088                         DEBUG_EXECUTE_r(
3089                             PerlIO_printf(Perl_debug_log,
3090                                           "%*s  %sfailed to match trie start class...%s\n",
3091                                           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3092                         );
3093                         sayNO_SILENT;
3094                    }
3095                 }
3096
3097             {
3098                 U8 *uc = ( U8* )locinput;
3099
3100                 STRLEN len = 0;
3101                 STRLEN foldlen = 0;
3102                 U8 *uscan = (U8*)NULL;
3103                 STRLEN bufflen=0;
3104                 SV *sv_accept_buff = NULL;
3105                 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3106
3107                 ST.accepted = 0; /* how many accepting states we have seen */
3108                 ST.B = next;
3109                 ST.jump = trie->jump;
3110                 ST.me = scan;
3111                 /*
3112                    traverse the TRIE keeping track of all accepting states
3113                    we transition through until we get to a failing node.
3114                 */
3115
3116                 while ( state && uc <= (U8*)PL_regeol ) {
3117                     U32 base = trie->states[ state ].trans.base;
3118                     UV uvc = 0;
3119                     U16 charid;
3120                     /* We use charid to hold the wordnum as we don't use it
3121                        for charid until after we have done the wordnum logic.
3122                        We define an alias just so that the wordnum logic reads
3123                        more naturally. */
3124
3125 #define got_wordnum charid
3126                     got_wordnum = trie->states[ state ].wordnum;
3127
3128                     if ( got_wordnum ) {
3129                         if ( ! ST.accepted ) {
3130                             ENTER;
3131                             SAVETMPS; /* XXX is this necessary? dmq */
3132                             bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3133                             sv_accept_buff=newSV(bufflen *
3134                                             sizeof(reg_trie_accepted) - 1);
3135                             SvCUR_set(sv_accept_buff, 0);
3136                             SvPOK_on(sv_accept_buff);
3137                             sv_2mortal(sv_accept_buff);
3138                             SAVETMPS;
3139                             ST.accept_buff =
3140                                 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3141                         }
3142                         do {
3143                             if (ST.accepted >= bufflen) {
3144                                 bufflen *= 2;
3145                                 ST.accept_buff =(reg_trie_accepted*)
3146                                     SvGROW(sv_accept_buff,
3147                                         bufflen * sizeof(reg_trie_accepted));
3148                             }
3149                             SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3150                                 + sizeof(reg_trie_accepted));
3151
3152
3153                             ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3154                             ST.accept_buff[ST.accepted].endpos = uc;
3155                             ++ST.accepted;
3156                         } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3157                     }
3158 #undef got_wordnum
3159
3160                     DEBUG_TRIE_EXECUTE_r({
3161                                 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3162                                 PerlIO_printf( Perl_debug_log,
3163                                     "%*s  %sState: %4"UVxf" Accepted: %4"UVxf" ",
3164                                     2+depth * 2, "", PL_colors[4],
3165                                     (UV)state, (UV)ST.accepted );
3166                     });
3167
3168                     if ( base ) {
3169                         REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3170                                              uscan, len, uvc, charid, foldlen,
3171                                              foldbuf, uniflags);
3172
3173                         if (charid &&
3174                              (base + charid > trie->uniquecharcount )
3175                              && (base + charid - 1 - trie->uniquecharcount
3176                                     < trie->lasttrans)
3177                              && trie->trans[base + charid - 1 -
3178                                     trie->uniquecharcount].check == state)
3179                         {
3180                             state = trie->trans[base + charid - 1 -
3181                                 trie->uniquecharcount ].next;
3182                         }
3183                         else {
3184                             state = 0;
3185                         }
3186                         uc += len;
3187
3188                     }
3189                     else {
3190                         state = 0;
3191                     }
3192                     DEBUG_TRIE_EXECUTE_r(
3193                         PerlIO_printf( Perl_debug_log,
3194                             "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3195                             charid, uvc, (UV)state, PL_colors[5] );
3196                     );
3197                 }
3198                 if (!ST.accepted )
3199                    sayNO;
3200
3201                 DEBUG_EXECUTE_r(
3202                     PerlIO_printf( Perl_debug_log,
3203                         "%*s  %sgot %"IVdf" possible matches%s\n",
3204                         REPORT_CODE_OFF + depth * 2, "",
3205                         PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3206                 );
3207             }}
3208             goto trie_first_try; /* jump into the fail handler */
3209             /* NOTREACHED */
3210         case TRIE_next_fail: /* we failed - try next alterative */
3211             if ( ST.jump) {
3212                 REGCP_UNWIND(ST.cp);
3213                 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3214                     PL_regoffs[n].end = -1;
3215                 *PL_reglastparen = n;
3216             }
3217           trie_first_try:
3218             if (do_cutgroup) {
3219                 do_cutgroup = 0;
3220                 no_final = 0;
3221             }
3222
3223             if ( ST.jump) {
3224                 ST.lastparen = *PL_reglastparen;
3225                 REGCP_SET(ST.cp);
3226             }
3227             if ( ST.accepted == 1 ) {
3228                 /* only one choice left - just continue */
3229                 DEBUG_EXECUTE_r({
3230                     AV *const trie_words
3231                         = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3232                     SV ** const tmp = av_fetch( trie_words,
3233                         ST.accept_buff[ 0 ].wordnum-1, 0 );
3234                     SV *sv= tmp ? sv_newmortal() : NULL;
3235
3236                     PerlIO_printf( Perl_debug_log,
3237                         "%*s  %sonly one match left: #%d <%s>%s\n",
3238                         REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3239                         ST.accept_buff[ 0 ].wordnum,
3240                         tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3241                                 PL_colors[0], PL_colors[1],
3242                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3243                             )
3244                         : "not compiled under -Dr",
3245                         PL_colors[5] );
3246                 });
3247                 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3248                 /* in this case we free tmps/leave before we call regmatch
3249                    as we wont be using accept_buff again. */
3250
3251                 locinput = PL_reginput;
3252                 nextchr = UCHARAT(locinput);
3253                 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3254                     scan = ST.B;
3255                 else
3256                     scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3257                 if (!has_cutgroup) {
3258                     FREETMPS;
3259                     LEAVE;
3260                 } else {
3261                     ST.accepted--;
3262                     PUSH_YES_STATE_GOTO(TRIE_next, scan);
3263                 }
3264
3265                 continue; /* execute rest of RE */
3266             }
3267
3268             if ( !ST.accepted-- ) {
3269                 DEBUG_EXECUTE_r({
3270                     PerlIO_printf( Perl_debug_log,
3271                         "%*s  %sTRIE failed...%s\n",
3272                         REPORT_CODE_OFF+depth*2, "",
3273                         PL_colors[4],
3274                         PL_colors[5] );
3275                 });
3276                 FREETMPS;
3277                 LEAVE;
3278                 sayNO_SILENT;
3279                 /*NOTREACHED*/
3280             }
3281
3282             /*
3283                There are at least two accepting states left.  Presumably
3284                the number of accepting states is going to be low,
3285                typically two. So we simply scan through to find the one
3286                with lowest wordnum.  Once we find it, we swap the last
3287                state into its place and decrement the size. We then try to
3288                match the rest of the pattern at the point where the word
3289                ends. If we succeed, control just continues along the
3290                regex; if we fail we return here to try the next accepting
3291                state
3292              */
3293
3294             {
3295                 U32 best = 0;
3296                 U32 cur;
3297                 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3298                     DEBUG_TRIE_EXECUTE_r(
3299                         PerlIO_printf( Perl_debug_log,
3300                             "%*s  %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3301                             REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3302                             (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3303                             ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3304                     );
3305
3306                     if (ST.accept_buff[cur].wordnum <
3307                             ST.accept_buff[best].wordnum)
3308                         best = cur;
3309                 }
3310
3311                 DEBUG_EXECUTE_r({
3312                     AV *const trie_words
3313                         = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3314                     SV ** const tmp = av_fetch( trie_words,
3315                         ST.accept_buff[ best ].wordnum - 1, 0 );
3316                     regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3317                                     ST.B :
3318                                     ST.me + ST.jump[ST.accept_buff[best].wordnum];
3319                     SV *sv= tmp ? sv_newmortal() : NULL;
3320
3321                     PerlIO_printf( Perl_debug_log,
3322                         "%*s  %strying alternation #%d <%s> at node #%d %s\n",
3323                         REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3324                         ST.accept_buff[best].wordnum,
3325                         tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3326                                 PL_colors[0], PL_colors[1],
3327                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3328                             ) : "not compiled under -Dr",
3329                             REG_NODE_NUM(nextop),
3330                         PL_colors[5] );
3331                 });
3332
3333                 if ( best<ST.accepted ) {
3334                     reg_trie_accepted tmp = ST.accept_buff[ best ];
3335                     ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3336                     ST.accept_buff[ ST.accepted ] = tmp;
3337                     best = ST.accepted;
3338                 }
3339                 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3340                 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3341                     scan = ST.B;
3342                 } else {
3343                     scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3344                 }
3345                 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3346                 /* NOTREACHED */
3347             }
3348             /* NOTREACHED */
3349         case TRIE_next:
3350             /* we dont want to throw this away, see bug 57042*/
3351             if (oreplsv != GvSV(PL_replgv))
3352                 sv_setsv(oreplsv, GvSV(PL_replgv));
3353             FREETMPS;
3354             LEAVE;
3355             sayYES;
3356 #undef  ST
3357
3358         case EXACT: {
3359             char *s = STRING(scan);
3360             ln = STR_LEN(scan);
3361             if (do_utf8 != UTF) {
3362                 /* The target and the pattern have differing utf8ness. */
3363                 char *l = locinput;
3364                 const char * const e = s + ln;
3365
3366                 if (do_utf8) {
3367                     /* The target is utf8, the pattern is not utf8. */
3368                     while (s < e) {
3369                         STRLEN ulen;
3370                         if (l >= PL_regeol)
3371                              sayNO;
3372                         if (NATIVE_TO_UNI(*(U8*)s) !=
3373                             utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3374                                             uniflags))
3375                              sayNO;
3376                         l += ulen;
3377                         s ++;
3378                     }
3379                 }
3380                 else {
3381                     /* The target is not utf8, the pattern is utf8. */
3382                     while (s < e) {
3383                         STRLEN ulen;
3384                         if (l >= PL_regeol)
3385                             sayNO;
3386                         if (NATIVE_TO_UNI(*((U8*)l)) !=
3387                             utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3388                                            uniflags))
3389                             sayNO;
3390                         s += ulen;
3391                         l ++;
3392                     }
3393                 }
3394                 locinput = l;
3395                 nextchr = UCHARAT(locinput);
3396                 break;
3397             }
3398             /* The target and the pattern have the same utf8ness. */
3399             /* Inline the first character, for speed. */
3400             if (UCHARAT(s) != nextchr)
3401                 sayNO;
3402             if (PL_regeol - locinput < ln)
3403                 sayNO;
3404             if (ln > 1 && memNE(s, locinput, ln))
3405                 sayNO;
3406             locinput += ln;
3407             nextchr = UCHARAT(locinput);
3408             break;
3409             }
3410         case EXACTFL:
3411             PL_reg_flags |= RF_tainted;
3412             /* FALL THROUGH */
3413         case EXACTF: {
3414             char * const s = STRING(scan);
3415             ln = STR_LEN(scan);
3416
3417             if (do_utf8 || UTF) {
3418               /* Either target or the pattern are utf8. */
3419                 const char * const l = locinput;
3420                 char *e = PL_regeol;
3421
3422                 if (ibcmp_utf8(s, 0,  ln, (bool)UTF,
3423                                l, &e, 0,  do_utf8)) {
3424                      /* One more case for the sharp s:
3425                       * pack("U0U*", 0xDF) =~ /ss/i,
3426                       * the 0xC3 0x9F are the UTF-8
3427                       * byte sequence for the U+00DF. */
3428
3429                      if (!(do_utf8 &&
3430                            toLOWER(s[0]) == 's' &&
3431                            ln >= 2 &&
3432                            toLOWER(s[1]) == 's' &&
3433                            (U8)l[0] == 0xC3 &&
3434                            e - l >= 2 &&
3435                            (U8)l[1] == 0x9F))
3436                           sayNO;
3437                 }
3438                 locinput = e;
3439                 nextchr = UCHARAT(locinput);
3440                 break;
3441             }
3442
3443             /* Neither the target and the pattern are utf8. */
3444
3445             /* Inline the first character, for speed. */
3446             if (UCHARAT(s) != nextchr &&
3447                 UCHARAT(s) != ((OP(scan) == EXACTF)
3448                                ? PL_fold : PL_fold_locale)[nextchr])
3449                 sayNO;
3450             if (PL_regeol - locinput < ln)
3451                 sayNO;
3452             if (ln > 1 && (OP(scan) == EXACTF
3453                            ? ibcmp(s, locinput, ln)
3454                            : ibcmp_locale(s, locinput, ln)))
3455                 sayNO;
3456             locinput += ln;
3457             nextchr = UCHARAT(locinput);
3458             break;
3459             }
3460         case BOUNDL:
3461         case NBOUNDL:
3462             PL_reg_flags |= RF_tainted;
3463             /* FALL THROUGH */
3464         case BOUND:
3465         case NBOUND:
3466             /* was last char in word? */
3467             if (do_utf8) {
3468                 if (locinput == PL_bostr)
3469                     ln = '\n';
3470                 else {
3471                     const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3472
3473                     ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3474                 }
3475                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3476                     ln = isALNUM_uni(ln);
3477                     LOAD_UTF8_CHARCLASS_ALNUM();
3478                     n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3479                 }
3480                 else {
3481                     ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3482                     n = isALNUM_LC_utf8((U8*)locinput);
3483                 }
3484             }
3485             else {
3486                 ln = (locinput != PL_bostr) ?
3487                     UCHARAT(locinput - 1) : '\n';
3488                 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3489                     ln = isALNUM(ln);
3490                     n = isALNUM(nextchr);
3491                 }
3492                 else {
3493                     ln = isALNUM_LC(ln);
3494                     n = isALNUM_LC(nextchr);
3495                 }
3496             }
3497             if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3498                                     OP(scan) == BOUNDL))
3499                     sayNO;
3500             break;
3501         case ANYOF:
3502             if (do_utf8) {
3503                 STRLEN inclasslen = PL_regeol - locinput;
3504
3505                 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3506                     goto anyof_fail;
3507                 if (locinput >= PL_regeol)
3508                     sayNO;
3509                 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3510                 nextchr = UCHARAT(locinput);
3511                 break;
3512             }
3513             else {
3514                 if (nextchr < 0)
3515                     nextchr = UCHARAT(locinput);
3516                 if (!REGINCLASS(rex, scan, (U8*)locinput))
3517                     goto anyof_fail;
3518                 if (!nextchr && locinput >= PL_regeol)
3519                     sayNO;
3520                 nextchr = UCHARAT(++locinput);
3521                 break;
3522             }
3523         anyof_fail:
3524             /* If we might have the case of the German sharp s
3525              * in a casefolding Unicode character class. */
3526
3527             if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3528                  locinput += SHARP_S_SKIP;
3529                  nextchr = UCHARAT(locinput);
3530             }
3531             else
3532                  sayNO;
3533             break;
3534         /* Special char classes - The defines start on line 129 or so */
3535         CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3536         CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3537
3538         CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3539         CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3540
3541         CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3542         CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3543
3544         case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3545                        a Unicode extended Grapheme Cluster */
3546             /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3547               extended Grapheme Cluster is:
3548
3549                CR LF
3550                | Prepend* Begin Extend*
3551                | .
3552
3553                Begin is (Hangul-syllable | ! Control)
3554                Extend is (Grapheme_Extend | Spacing_Mark)
3555                Control is [ GCB_Control CR LF ]
3556
3557                The discussion below shows how the code for CLUMP is derived
3558                from this regex.  Note that most of these concepts are from
3559                property values of the Grapheme Cluster Boundary (GCB) property.
3560                No code point can have multiple property values for a given
3561                property.  Thus a code point in Prepend can't be in Control, but
3562                it must be in !Control.  This is why Control above includes
3563                GCB_Control plus CR plus LF.  The latter two are used in the GCB
3564                property separately, and so can't be in GCB_Control, even though
3565                they logically are controls.  Control is not the same as gc=cc,
3566                but includes format and other characters as well.
3567
3568                The Unicode definition of Hangul-syllable is:
3569                    L+
3570                    | (L* ( ( V | LV ) V* | LVT ) T*)
3571                    | T+
3572                   )
3573                Each of these is a value for the GCB property, and hence must be
3574                disjoint, so the order they are tested is immaterial, so the
3575                above can safely be changed to
3576                    T+
3577                    | L+
3578                    | (L* ( LVT | ( V | LV ) V*) T*)
3579
3580                The last two terms can be combined like this:
3581                    L* ( L
3582                         | (( LVT | ( V | LV ) V*) T*))
3583
3584                And refactored into this:
3585                    L* (L | LVT T* | V  V* T* | LV  V* T*)
3586
3587                That means that if we have seen any L's at all we can quit
3588                there, but if the next character is a LVT, a V or and LV we
3589                should keep going.
3590
3591                There is a subtlety with Prepend* which showed up in testing.
3592                Note that the Begin, and only the Begin is required in:
3593                 | Prepend* Begin Extend*
3594                Also, Begin contains '! Control'.  A Prepend must be a '!
3595                Control', which means it must be a Begin.  What it comes down to
3596                is that if we match Prepend* and then find no suitable Begin
3597                afterwards, that if we backtrack the last Prepend, that one will
3598                be a suitable Begin.
3599             */
3600
3601             if (locinput >= PL_regeol)
3602                 sayNO;
3603             if  (! do_utf8) {
3604
3605                 /* Match either CR LF  or '.', as all the other possibilities
3606                  * require utf8 */
3607                 locinput++;         /* Match the . or CR */
3608                 if (nextchr == '\r'
3609                     && locinput < PL_regeol
3610                     && UCHARAT(locinput) == '\n') locinput++;
3611             }
3612             else {
3613
3614                 /* Utf8: See if is ( CR LF ); already know that locinput <
3615                  * PL_regeol, so locinput+1 is in bounds */
3616                 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3617                     locinput += 2;
3618                 }
3619                 else {
3620                     /* In case have to backtrack to beginning, then match '.' */
3621                     char *starting = locinput;
3622
3623                     /* In case have to backtrack the last prepend */
3624                     char *previous_prepend = 0;
3625
3626                     LOAD_UTF8_CHARCLASS_GCB();
3627
3628                     /* Match (prepend)* */
3629                     while (locinput < PL_regeol
3630                            && swash_fetch(PL_utf8_X_prepend,
3631                                           (U8*)locinput, do_utf8))
3632                     {
3633                         previous_prepend = locinput;
3634                         locinput += UTF8SKIP(locinput);
3635                     }
3636
3637                     /* As noted above, if we matched a prepend character, but
3638                      * the next thing won't match, back off the last prepend we
3639                      * matched, as it is guaranteed to match the begin */
3640                     if (previous_prepend
3641                         && (locinput >=  PL_regeol
3642                             || ! swash_fetch(PL_utf8_X_begin,
3643                                              (U8*)locinput, do_utf8)))
3644                     {
3645                         locinput = previous_prepend;
3646                     }
3647
3648                     /* Note that here we know PL_regeol > locinput, as we
3649                      * tested that upon input to this switch case, and if we
3650                      * moved locinput forward, we tested the result just above
3651                      * and it either passed, or we backed off so that it will
3652                      * now pass */
3653                     if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) {
3654
3655                         /* Here did not match the required 'Begin' in the
3656                          * second term.  So just match the very first
3657                          * character, the '.' of the final term of the regex */
3658                         locinput = starting + UTF8SKIP(starting);
3659                     } else {
3660
3661                         /* Here is the beginning of a character that can have
3662                          * an extender.  It is either a hangul syllable, or a
3663                          * non-control */
3664                         if (swash_fetch(PL_utf8_X_non_hangul,
3665                                         (U8*)locinput, do_utf8))
3666                         {
3667
3668                             /* Here not a Hangul syllable, must be a
3669                              * ('!  * Control') */
3670                             locinput += UTF8SKIP(locinput);
3671                         } else {
3672
3673                             /* Here is a Hangul syllable.  It can be composed
3674                              * of several individual characters.  One
3675                              * possibility is T+ */
3676                             if (swash_fetch(PL_utf8_X_T,
3677                                             (U8*)locinput, do_utf8))
3678                             {
3679                                 while (locinput < PL_regeol
3680                                         && swash_fetch(PL_utf8_X_T,
3681                                                         (U8*)locinput, do_utf8))
3682                                 {
3683                                     locinput += UTF8SKIP(locinput);
3684                                 }
3685                             } else {
3686
3687                                 /* Here, not T+, but is a Hangul.  That means
3688                                  * it is one of the others: L, LV, LVT or V,
3689                                  * and matches:
3690                                  * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3691
3692                                 /* Match L*           */
3693                                 while (locinput < PL_regeol
3694                                         && swash_fetch(PL_utf8_X_L,
3695                                                         (U8*)locinput, do_utf8))
3696                                 {
3697                                     locinput += UTF8SKIP(locinput);
3698                                 }
3699
3700                                 /* Here, have exhausted L*.  If the next
3701                                  * character is not an LV, LVT nor V, it means
3702                                  * we had to have at least one L, so matches L+
3703                                  * in the original equation, we have a complete
3704                                  * hangul syllable.  Are done. */
3705
3706                                 if (locinput < PL_regeol
3707                                     && swash_fetch(PL_utf8_X_LV_LVT_V,
3708                                                     (U8*)locinput, do_utf8))
3709                                 {
3710
3711                                     /* Otherwise keep going.  Must be LV, LVT
3712                                      * or V.  See if LVT */
3713                                     if (swash_fetch(PL_utf8_X_LVT,
3714                                                     (U8*)locinput, do_utf8))
3715                                     {
3716                                         locinput += UTF8SKIP(locinput);
3717                                     } else {
3718
3719                                         /* Must be  V or LV.  Take it, then
3720                                          * match V*     */
3721                                         locinput += UTF8SKIP(locinput);
3722                                         while (locinput < PL_regeol
3723                                                 && swash_fetch(PL_utf8_X_V,
3724                                                          (U8*)locinput, do_utf8))
3725                                         {
3726                                             locinput += UTF8SKIP(locinput);
3727                                         }
3728                                     }
3729
3730                                     /* And any of LV, LVT, or V can be followed
3731                                      * by T*            */
3732                                     while (locinput < PL_regeol
3733                                            && swash_fetch(PL_utf8_X_T,
3734                                                            (U8*)locinput,
3735                                                            do_utf8))
3736                                     {
3737                                         locinput += UTF8SKIP(locinput);
3738                                     }
3739                                 }
3740                             }
3741                         }
3742
3743                         /* Match any extender */
3744                         while (locinput < PL_regeol
3745                                 && swash_fetch(PL_utf8_X_extend,
3746                                                 (U8*)locinput, do_utf8))
3747                         {
3748                             locinput += UTF8SKIP(locinput);
3749                         }
3750                     }
3751                 }
3752                 if (locinput > PL_regeol) sayNO;
3753             }
3754             nextchr = UCHARAT(locinput);
3755             break;
3756
3757         case NREFFL:
3758         {
3759             char *s;
3760             char type;
3761             PL_reg_flags |= RF_tainted;
3762             /* FALL THROUGH */
3763         case NREF:
3764         case NREFF:
3765             type = OP(scan);
3766             n = reg_check_named_buff_matched(rex,scan);
3767
3768             if ( n ) {
3769                 type = REF + ( type - NREF );
3770                 goto do_ref;
3771             } else {
3772                 sayNO;
3773             }
3774             /* unreached */
3775         case REFFL:
3776             PL_reg_flags |= RF_tainted;
3777             /* FALL THROUGH */
3778         case REF:
3779         case REFF:
3780             n = ARG(scan);  /* which paren pair */
3781             type = OP(scan);
3782           do_ref:
3783             ln = PL_regoffs[n].start;
3784             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
3785             if (*PL_reglastparen < n || ln == -1)
3786                 sayNO;                  /* Do not match unless seen CLOSEn. */
3787             if (ln == PL_regoffs[n].end)
3788                 break;
3789
3790             s = PL_bostr + ln;
3791             if (do_utf8 && type != REF) {       /* REF can do byte comparison */
3792                 char *l = locinput;
3793                 const char *e = PL_bostr + PL_regoffs[n].end;
3794                 /*
3795                  * Note that we can't do the "other character" lookup trick as
3796                  * in the 8-bit case (no pun intended) because in Unicode we
3797                  * have to map both upper and title case to lower case.
3798                  */
3799                 if (type == REFF) {
3800                     while (s < e) {
3801                         STRLEN ulen1, ulen2;
3802                         U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3803                         U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3804
3805                         if (l >= PL_regeol)
3806                             sayNO;
3807                         toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3808                         toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3809                         if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3810                             sayNO;
3811                         s += ulen1;
3812                         l += ulen2;
3813                     }
3814                 }
3815                 locinput = l;
3816                 nextchr = UCHARAT(locinput);
3817                 break;
3818             }
3819
3820             /* Inline the first character, for speed. */
3821             if (UCHARAT(s) != nextchr &&
3822                 (type == REF ||
3823                  (UCHARAT(s) != (type == REFF
3824                                   ? PL_fold : PL_fold_locale)[nextchr])))
3825                 sayNO;
3826             ln = PL_regoffs[n].end - ln;
3827             if (locinput + ln > PL_regeol)
3828                 sayNO;
3829             if (ln > 1 && (type == REF
3830                            ? memNE(s, locinput, ln)
3831                            : (type == REFF
3832                               ? ibcmp(s, locinput, ln)
3833                               : ibcmp_locale(s, locinput, ln))))
3834                 sayNO;
3835             locinput += ln;
3836             nextchr = UCHARAT(locinput);
3837             break;
3838         }
3839         case NOTHING:
3840         case TAIL:
3841             break;
3842         case BACK:
3843             break;
3844
3845 #undef  ST
3846 #define ST st->u.eval
3847         {
3848             SV *ret;
3849             REGEXP *re_sv;
3850             regexp *re;
3851             regexp_internal *rei;
3852             regnode *startpoint;
3853
3854         case GOSTART:
3855         case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3856             if (cur_eval && cur_eval->locinput==locinput) {
3857                 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3858                     Perl_croak(aTHX_ "Infinite recursion in regex");
3859                 if ( ++nochange_depth > max_nochange_depth )
3860                     Perl_croak(aTHX_
3861                         "Pattern subroutine nesting without pos change"
3862                         " exceeded limit in regex");
3863             } else {
3864                 nochange_depth = 0;
3865             }
3866             re_sv = rex_sv;
3867             re = rex;
3868             rei = rexi;
3869             (void)ReREFCNT_inc(rex_sv);
3870             if (OP(scan)==GOSUB) {
3871                 startpoint = scan + ARG2L(scan);
3872                 ST.close_paren = ARG(scan);
3873             } else {
3874                 startpoint = rei->program+1;
3875                 ST.close_paren = 0;
3876             }
3877             goto eval_recurse_doit;
3878             /* NOTREACHED */
3879         case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
3880             if (cur_eval && cur_eval->locinput==locinput) {
3881                 if ( ++nochange_depth > max_nochange_depth )
3882                     Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3883             } else {
3884                 nochange_depth = 0;
3885             }
3886             {
3887                 /* execute the code in the {...} */
3888                 dSP;
3889                 SV ** const before = SP;
3890                 OP_4tree * const oop = PL_op;
3891                 COP * const ocurcop = PL_curcop;
3892                 PAD *old_comppad;
3893                 char *saved_regeol = PL_regeol;
3894
3895                 n = ARG(scan);
3896                 PL_op = (OP_4tree*)rexi->data->data[n];
3897                 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3898                     "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3899                 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3900                 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3901
3902                 if (sv_yes_mark) {
3903                     SV *sv_mrk = get_sv("REGMARK", 1);
3904                     sv_setsv(sv_mrk, sv_yes_mark);
3905                 }
3906
3907                 CALLRUNOPS(aTHX);                       /* Scalar context. */
3908                 SPAGAIN;
3909                 if (SP == before)
3910                     ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
3911                 else {
3912                     ret = POPs;
3913                     PUTBACK;
3914                 }
3915
3916                 PL_op = oop;
3917                 PAD_RESTORE_LOCAL(old_comppad);
3918                 PL_curcop = ocurcop;
3919                 PL_regeol = saved_regeol;
3920                 if (!logical) {
3921                     /* /(?{...})/ */
3922                     sv_setsv(save_scalar(PL_replgv), ret);
3923                     break;
3924                 }
3925             }
3926             if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3927                 logical = 0;
3928                 {
3929                     /* extract RE object from returned value; compiling if
3930                      * necessary */
3931                     MAGIC *mg = NULL;
3932                     REGEXP *rx = NULL;
3933
3934                     if (SvROK(ret)) {
3935                         SV *const sv = SvRV(ret);
3936
3937                         if (SvTYPE(sv) == SVt_REGEXP) {
3938                             rx = (REGEXP*) sv;
3939                         } else if (SvSMAGICAL(sv)) {
3940                             mg = mg_find(sv, PERL_MAGIC_qr);
3941                             assert(mg);
3942                         }
3943                     } else if (SvTYPE(ret) == SVt_REGEXP) {
3944                         rx = (REGEXP*) ret;
3945                     } else if (SvSMAGICAL(ret)) {
3946                         if (SvGMAGICAL(ret)) {
3947                             /* I don't believe that there is ever qr magic
3948                                here.  */
3949                             assert(!mg_find(ret, PERL_MAGIC_qr));
3950                             sv_unmagic(ret, PERL_MAGIC_qr);
3951                         }
3952                         else {
3953                             mg = mg_find(ret, PERL_MAGIC_qr);
3954                             /* testing suggests mg only ends up non-NULL for
3955                                scalars who were upgraded and compiled in the
3956                                else block below. In turn, this is only
3957                                triggered in the "postponed utf8 string" tests
3958                                in t/op/pat.t  */
3959                         }
3960                     }
3961
3962                     if (mg) {
3963                         rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
3964                         assert(rx);
3965                     }
3966                     if (rx) {
3967                         rx = reg_temp_copy(NULL, rx);
3968                     }
3969                     else {
3970                         U32 pm_flags = 0;
3971                         const I32 osize = PL_regsize;
3972
3973                         if (DO_UTF8(ret)) {
3974                             assert (SvUTF8(ret));
3975                         } else if (SvUTF8(ret)) {
3976                             /* Not doing UTF-8, despite what the SV says. Is
3977                                this only if we're trapped in use 'bytes'?  */
3978                             /* Make a copy of the octet sequence, but without
3979                                the flag on, as the compiler now honours the
3980                                SvUTF8 flag on ret.  */
3981                             STRLEN len;
3982                             const char *const p = SvPV(ret, len);
3983                             ret = newSVpvn_flags(p, len, SVs_TEMP);
3984                         }
3985                         rx = CALLREGCOMP(ret, pm_flags);
3986                         if (!(SvFLAGS(ret)
3987                               & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3988                                  | SVs_GMG))) {
3989                             /* This isn't a first class regexp. Instead, it's
3990                                caching a regexp onto an existing, Perl visible
3991                                scalar.  */
3992                             sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3993                         }
3994                         PL_regsize = osize;
3995                     }
3996                     re_sv = rx;
3997                     re = (struct regexp *)SvANY(rx);
3998                 }
3999                 RXp_MATCH_COPIED_off(re);
4000                 re->subbeg = rex->subbeg;
4001                 re->sublen = rex->sublen;
4002                 rei = RXi_GET(re);
4003                 DEBUG_EXECUTE_r(
4004                     debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
4005                         "Matching embedded");
4006                 );
4007                 startpoint = rei->program + 1;
4008                 ST.close_paren = 0; /* only used for GOSUB */
4009                 /* borrowed from regtry */
4010                 if (PL_reg_start_tmpl <= re->nparens) {
4011                     PL_reg_start_tmpl = re->nparens*3/2 + 3;
4012                     if(PL_reg_start_tmp)
4013                         Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4014                     else
4015                         Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4016                 }
4017
4018         eval_recurse_doit: /* Share code with GOSUB below this line */
4019                 /* run the pattern returned from (??{...}) */
4020                 ST.cp = regcppush(0);   /* Save *all* the positions. */
4021                 REGCP_SET(ST.lastcp);
4022
4023                 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4024
4025                 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4026                 PL_reglastparen = &re->lastparen;
4027                 PL_reglastcloseparen = &re->lastcloseparen;
4028                 re->lastparen = 0;
4029                 re->lastcloseparen = 0;
4030
4031                 PL_reginput = locinput;
4032                 PL_regsize = 0;
4033
4034                 /* XXXX This is too dramatic a measure... */
4035                 PL_reg_maxiter = 0;
4036
4037                 ST.toggle_reg_flags = PL_reg_flags;
4038                 if (RX_UTF8(re_sv))
4039                     PL_reg_flags |= RF_utf8;
4040                 else
4041                     PL_reg_flags &= ~RF_utf8;
4042                 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4043
4044                 ST.prev_rex = rex_sv;
4045                 ST.prev_curlyx = cur_curlyx;
4046                 SETREX(rex_sv,re_sv);
4047                 rex = re;
4048                 rexi = rei;
4049                 cur_curlyx = NULL;
4050                 ST.B = next;
4051                 ST.prev_eval = cur_eval;
4052                 cur_eval = st;
4053                 /* now continue from first node in postoned RE */
4054                 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4055                 /* NOTREACHED */
4056             }
4057             /* logical is 1,   /(?(?{...})X|Y)/ */
4058             sw = (bool)SvTRUE(ret);
4059             logical = 0;
4060             break;
4061         }
4062
4063         case EVAL_AB: /* cleanup after a successful (??{A})B */
4064             /* note: this is called twice; first after popping B, then A */
4065             PL_reg_flags ^= ST.toggle_reg_flags;
4066             ReREFCNT_dec(rex_sv);
4067             SETREX(rex_sv,ST.prev_rex);
4068             rex = (struct regexp *)SvANY(rex_sv);
4069             rexi = RXi_GET(rex);
4070             regcpblow(ST.cp);
4071             cur_eval = ST.prev_eval;
4072             cur_curlyx = ST.prev_curlyx;
4073
4074             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4075             PL_reglastparen = &rex->lastparen;
4076             PL_reglastcloseparen = &rex->lastcloseparen;
4077             /* also update PL_regoffs */
4078             PL_regoffs = rex->offs;
4079
4080             /* XXXX This is too dramatic a measure... */
4081             PL_reg_maxiter = 0;
4082             if ( nochange_depth )
4083                 nochange_depth--;
4084             sayYES;
4085
4086
4087         case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4088             /* note: this is called twice; first after popping B, then A */
4089             PL_reg_flags ^= ST.toggle_reg_flags;
4090             ReREFCNT_dec(rex_sv);
4091             SETREX(rex_sv,ST.prev_rex);
4092             rex = (struct regexp *)SvANY(rex_sv);
4093             rexi = RXi_GET(rex);
4094             /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4095             PL_reglastparen = &rex->lastparen;
4096             PL_reglastcloseparen = &rex->lastcloseparen;
4097
4098             PL_reginput = locinput;
4099             REGCP_UNWIND(ST.lastcp);
4100             regcppop(rex);
4101             cur_eval = ST.prev_eval;
4102             cur_curlyx = ST.prev_curlyx;
4103             /* XXXX This is too dramatic a measure... */
4104             PL_reg_maxiter = 0;
4105             if ( nochange_depth )
4106                 nochange_depth--;
4107             sayNO_SILENT;
4108 #undef ST
4109
4110         case OPEN:
4111             n = ARG(scan);  /* which paren pair */
4112             PL_reg_start_tmp[n] = locinput;
4113             if (n > PL_regsize)
4114                 PL_regsize = n;
4115             lastopen = n;
4116             break;
4117         case CLOSE:
4118             n = ARG(scan);  /* which paren pair */
4119             PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4120             PL_regoffs[n].end = locinput - PL_bostr;
4121             /*if (n > PL_regsize)
4122                 PL_regsize = n;*/
4123             if (n > *PL_reglastparen)
4124                 *PL_reglastparen = n;
4125             *PL_reglastcloseparen = n;
4126             if (cur_eval && cur_eval->u.eval.close_paren == n) {
4127                 goto fake_end;
4128             }
4129             break;
4130         case ACCEPT:
4131             if (ARG(scan)){
4132                 regnode *cursor;
4133                 for (cursor=scan;
4134                      cursor && OP(cursor)!=END;
4135                      cursor=regnext(cursor))
4136                 {
4137                     if ( OP(cursor)==CLOSE ){
4138                         n = ARG(cursor);
4139                         if ( n <= lastopen ) {
4140                             PL_regoffs[n].start
4141                                 = PL_reg_start_tmp[n] - PL_bostr;
4142                             PL_regoffs[n].end = locinput - PL_bostr;
4143                             /*if (n > PL_regsize)
4144                             PL_regsize = n;*/
4145                             if (n > *PL_reglastparen)
4146                                 *PL_reglastparen = n;
4147                             *PL_reglastcloseparen = n;
4148                             if ( n == ARG(scan) || (cur_eval &&
4149                                 cur_eval->u.eval.close_paren == n))
4150                                 break;
4151                         }
4152                     }
4153                 }
4154             }
4155             goto fake_end;
4156             /*NOTREACHED*/
4157         case GROUPP:
4158             n = ARG(scan);  /* which paren pair */
4159             sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4160             break;
4161         case NGROUPP:
4162             /* reg_check_named_buff_matched returns 0 for no match */
4163             sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
4164             break;
4165         case INSUBP:
4166             n = ARG(scan);
4167             sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4168             break;
4169         case DEFINEP:
4170             sw = 0;
4171             break;
4172         case IFTHEN:
4173             PL_reg_leftiter = PL_reg_maxiter;           /* Void cache */
4174             if (sw)
4175                 next = NEXTOPER(NEXTOPER(scan));
4176             else {
4177                 next = scan + ARG(scan);
4178                 if (OP(next) == IFTHEN) /* Fake one. */
4179                     next = NEXTOPER(NEXTOPER(next));
4180             }
4181             break;
4182         case LOGICAL:
4183             logical = scan->flags;
4184             break;
4185
4186 /*******************************************************************
4187
4188 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4189 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4190 STAR/PLUS/CURLY/CURLYN are used instead.)
4191
4192 A*B is compiled as <CURLYX><A><WHILEM><B>
4193
4194 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4195 state, which contains the current count, initialised to -1. It also sets
4196 cur_curlyx to point to this state, with any previous value saved in the
4197 state block.
4198
4199 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4200 since the pattern may possibly match zero times (i.e. it's a while {} loop
4201 rather than a do {} while loop).
4202
4203 Each entry to WHILEM represents a successful match of A. The count in the
4204 CURLYX block is incremented, another WHILEM state is pushed, and execution
4205 passes to A or B depending on greediness and the current count.
4206
4207 For example, if matching against the string a1a2a3b (where the aN are
4208 substrings that match /A/), then the match progresses as follows: (the
4209 pushed states are interspersed with the bits of strings matched so far):
4210
4211     <CURLYX cnt=-1>
4212     <CURLYX cnt=0><WHILEM>
4213     <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4214     <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4215     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4216     <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4217
4218 (Contrast this with something like CURLYM, which maintains only a single
4219 backtrack state:
4220
4221     <CURLYM cnt=0> a1
4222     a1 <CURLYM cnt=1> a2
4223     a1 a2 <CURLYM cnt=2> a3
4224     a1 a2 a3 <CURLYM cnt=3> b
4225 )
4226
4227 Each WHILEM state block marks a point to backtrack to upon partial failure
4228 of A or B, and also contains some minor state data related to that
4229 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4230 overall state, such as the count, and pointers to the A and B ops.
4231
4232 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4233 must always point to the *current* CURLYX block, the rules are:
4234
4235 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4236 and set cur_curlyx to point the new block.
4237
4238 When popping the CURLYX block after a successful or unsuccessful match,
4239 restore the previous cur_curlyx.
4240
4241 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4242 to the outer one saved in the CURLYX block.
4243
4244 When popping the WHILEM block after a successful or unsuccessful B match,
4245 restore the previous cur_curlyx.
4246
4247 Here's an example for the pattern (AI* BI)*BO
4248 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4249
4250 cur_
4251 curlyx backtrack stack
4252 ------ ---------------
4253 NULL
4254 CO     <CO prev=NULL> <WO>
4255 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4256 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4257 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4258
4259 At this point the pattern succeeds, and we work back down the stack to
4260 clean up, restoring as we go:
4261
4262 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4263 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4264 CO     <CO prev=NULL> <WO>
4265 NULL
4266
4267 *******************************************************************/
4268
4269 #define ST st->u.curlyx
4270
4271         case CURLYX:    /* start of /A*B/  (for complex A) */
4272         {
4273             /* No need to save/restore up to this paren */
4274             I32 parenfloor = scan->flags;
4275
4276             assert(next); /* keep Coverity happy */
4277             if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4278                 next += ARG(next);
4279
4280             /* XXXX Probably it is better to teach regpush to support
4281                parenfloor > PL_regsize... */
4282             if (parenfloor > (I32)*PL_reglastparen)
4283                 parenfloor = *PL_reglastparen; /* Pessimization... */
4284
4285             ST.prev_curlyx= cur_curlyx;
4286             cur_curlyx = st;
4287             ST.cp = PL_savestack_ix;
4288
4289             /* these fields contain the state of the current curly.
4290              * they are accessed by subsequent WHILEMs */
4291             ST.parenfloor = parenfloor;
4292             ST.min = ARG1(scan);
4293             ST.max = ARG2(scan);
4294             ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4295             ST.B = next;
4296             ST.minmod = minmod;
4297             minmod = 0;
4298             ST.count = -1;      /* this will be updated by WHILEM */
4299             ST.lastloc = NULL;  /* this will be updated by WHILEM */
4300
4301             PL_reginput = locinput;
4302             PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4303             /* NOTREACHED */
4304         }
4305
4306         case CURLYX_end: /* just finished matching all of A*B */
4307             cur_curlyx = ST.prev_curlyx;
4308             sayYES;
4309             /* NOTREACHED */
4310
4311         case CURLYX_end_fail: /* just failed to match all of A*B */
4312             regcpblow(ST.cp);
4313             cur_curlyx = ST.prev_curlyx;
4314             sayNO;
4315             /* NOTREACHED */
4316
4317
4318 #undef ST
4319 #define ST st->u.whilem
4320
4321         case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4322         {
4323             /* see the discussion above about CURLYX/WHILEM */
4324             I32 n;
4325             assert(cur_curlyx); /* keep Coverity happy */
4326             n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4327             ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4328             ST.cache_offset = 0;
4329             ST.cache_mask = 0;
4330
4331             PL_reginput = locinput;
4332
4333             DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4334                   "%*s  whilem: matched %ld out of %ld..%ld\n",
4335                   REPORT_CODE_OFF+depth*2, "", (long)n,
4336                   (long)cur_curlyx->u.curlyx.min,
4337                   (long)cur_curlyx->u.curlyx.max)
4338             );
4339
4340             /* First just match a string of min A's. */
4341
4342             if (n < cur_curlyx->u.curlyx.min) {
4343                 cur_curlyx->u.curlyx.lastloc = locinput;
4344                 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4345                 /* NOTREACHED */
4346             }
4347
4348             /* If degenerate A matches "", assume A done. */
4349
4350             if (locinput == cur_curlyx->u.curlyx.lastloc) {
4351                 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4352                    "%*s  whilem: empty match detected, trying continuation...\n",
4353                    REPORT_CODE_OFF+depth*2, "")
4354                 );
4355                 goto do_whilem_B_max;
4356             }
4357
4358             /* super-linear cache processing */
4359
4360             if (scan->flags) {
4361
4362                 if (!PL_reg_maxiter) {
4363                     /* start the countdown: Postpone detection until we
4364                      * know the match is not *that* much linear. */
4365                     PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4366                     /* possible overflow for long strings and many CURLYX's */
4367                     if (PL_reg_maxiter < 0)
4368                         PL_reg_maxiter = I32_MAX;
4369                     PL_reg_leftiter = PL_reg_maxiter;
4370                 }
4371
4372                 if (PL_reg_leftiter-- == 0) {
4373                     /* initialise cache */
4374                     const I32 size = (PL_reg_maxiter + 7)/8;
4375                     if (PL_reg_poscache) {
4376                         if ((I32)PL_reg_poscache_size < size) {
4377                             Renew(PL_reg_poscache, size, char);
4378                             PL_reg_poscache_size = size;
4379                         }
4380                         Zero(PL_reg_poscache, size, char);
4381                     }
4382                     else {
4383                         PL_reg_poscache_size = size;
4384                         Newxz(PL_reg_poscache, size, char);
4385                     }
4386                     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4387       "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4388                               PL_colors[4], PL_colors[5])
4389                     );
4390                 }
4391
4392                 if (PL_reg_leftiter < 0) {
4393                     /* have we already failed at this position? */
4394                     I32 offset, mask;
4395                     offset  = (scan->flags & 0xf) - 1
4396                                 + (locinput - PL_bostr)  * (scan->flags>>4);
4397                     mask    = 1 << (offset % 8);
4398                     offset /= 8;
4399                     if (PL_reg_poscache[offset] & mask) {
4400                         DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4401                             "%*s  whilem: (cache) already tried at this position...\n",
4402                             REPORT_CODE_OFF+depth*2, "")
4403                         );
4404                         sayNO; /* cache records failure */
4405                     }
4406                     ST.cache_offset = offset;
4407                     ST.cache_mask   = mask;
4408                 }
4409             }
4410
4411             /* Prefer B over A for minimal matching. */
4412
4413             if (cur_curlyx->u.curlyx.minmod) {
4414                 ST.save_curlyx = cur_curlyx;
4415                 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4416                 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4417                 REGCP_SET(ST.lastcp);
4418                 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4419                 /* NOTREACHED */
4420             }
4421
4422             /* Prefer A over B for maximal matching. */
4423
4424             if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4425                 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4426                 cur_curlyx->u.curlyx.lastloc = locinput;
4427                 REGCP_SET(ST.lastcp);
4428                 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4429                 /* NOTREACHED */
4430             }
4431             goto do_whilem_B_max;
4432         }
4433         /* NOTREACHED */
4434
4435         case WHILEM_B_min: /* just matched B in a minimal match */
4436         case WHILEM_B_max: /* just matched B in a maximal match */
4437             cur_curlyx = ST.save_curlyx;
4438             sayYES;
4439             /* NOTREACHED */
4440
4441         case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4442             cur_curlyx = ST.save_curlyx;
4443             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4444             cur_curlyx->u.curlyx.count--;
4445             CACHEsayNO;
4446             /* NOTREACHED */
4447
4448         case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4449             REGCP_UNWIND(ST.lastcp);
4450             regcppop(rex);
4451             /* FALL THROUGH */
4452         case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4453             cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4454             cur_curlyx->u.curlyx.count--;
4455             CACHEsayNO;
4456             /* NOTREACHED */
4457
4458         case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4459             REGCP_UNWIND(ST.lastcp);
4460             regcppop(rex);      /* Restore some previous $<digit>s? */
4461             PL_reginput = locinput;
4462             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4463                 "%*s  whilem: failed, trying continuation...\n",
4464                 REPORT_CODE_OFF+depth*2, "")
4465             );
4466           do_whilem_B_max:
4467             if (cur_curlyx->u.curlyx.count >= REG_INFTY
4468                 && ckWARN(WARN_REGEXP)
4469                 && !(PL_reg_flags & RF_warned))
4470             {
4471                 PL_reg_flags |= RF_warned;
4472                 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4473                      "Complex regular subexpression recursion",
4474                      REG_INFTY - 1);
4475             }
4476
4477             /* now try B */
4478             ST.save_curlyx = cur_curlyx;
4479             cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4480             PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4481             /* NOTREACHED */
4482
4483         case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4484             cur_curlyx = ST.save_curlyx;
4485             REGCP_UNWIND(ST.lastcp);
4486             regcppop(rex);
4487
4488             if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4489                 /* Maximum greed exceeded */
4490                 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4491                     && ckWARN(WARN_REGEXP)
4492                     && !(PL_reg_flags & RF_warned))
4493                 {
4494                     PL_reg_flags |= RF_warned;
4495                     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4496                         "%s limit (%d) exceeded",
4497                         "Complex regular subexpression recursion",
4498                         REG_INFTY - 1);
4499                 }
4500                 cur_curlyx->u.curlyx.count--;
4501                 CACHEsayNO;
4502             }
4503
4504             DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4505                 "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4506             );
4507             /* Try grabbing another A and see if it helps. */
4508             PL_reginput = locinput;
4509             cur_curlyx->u.curlyx.lastloc = locinput;
4510             ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4511             REGCP_SET(ST.lastcp);
4512             PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4513             /* NOTREACHED */
4514
4515 #undef  ST
4516 #define ST st->u.branch
4517
4518         case BRANCHJ:       /*  /(...|A|...)/ with long next pointer */
4519             next = scan + ARG(scan);
4520             if (next == scan)
4521                 next = NULL;
4522             scan = NEXTOPER(scan);
4523             /* FALL THROUGH */
4524
4525         case BRANCH:        /*  /(...|A|...)/ */
4526             scan = NEXTOPER(scan); /* scan now points to inner node */
4527             ST.lastparen = *PL_reglastparen;
4528             ST.next_branch = next;
4529             REGCP_SET(ST.cp);
4530             PL_reginput = locinput;
4531
4532             /* Now go into the branch */
4533             if (has_cutgroup) {
4534                 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4535             } else {
4536                 PUSH_STATE_GOTO(BRANCH_next, scan);
4537             }
4538             /* NOTREACHED */
4539         case CUTGROUP:
4540             PL_reginput = locinput;
4541             sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4542                 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4543             PUSH_STATE_GOTO(CUTGROUP_next,next);
4544             /* NOTREACHED */
4545         case CUTGROUP_next_fail:
4546             do_cutgroup = 1;
4547             no_final = 1;
4548             if (st->u.mark.mark_name)
4549                 sv_commit = st->u.mark.mark_name;
4550             sayNO;
4551             /* NOTREACHED */
4552         case BRANCH_next:
4553             sayYES;
4554             /* NOTREACHED */
4555         case BRANCH_next_fail: /* that branch failed; try the next, if any */
4556             if (do_cutgroup) {
4557                 do_cutgroup = 0;
4558                 no_final = 0;
4559             }
4560             REGCP_UNWIND(ST.cp);
4561             for (n = *PL_reglastparen; n > ST.lastparen; n--)
4562                 PL_regoffs[n].end = -1;
4563             *PL_reglastparen = n;
4564             /*dmq: *PL_reglastcloseparen = n; */
4565             scan = ST.next_branch;
4566             /* no more branches? */
4567             if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4568                 DEBUG_EXECUTE_r({
4569                     PerlIO_printf( Perl_debug_log,
4570                         "%*s  %sBRANCH failed...%s\n",
4571                         REPORT_CODE_OFF+depth*2, "",
4572                         PL_colors[4],
4573                         PL_colors[5] );
4574                 });
4575                 sayNO_SILENT;
4576             }
4577             continue; /* execute next BRANCH[J] op */
4578             /* NOTREACHED */
4579
4580         case MINMOD:
4581             minmod = 1;
4582             break;
4583
4584 #undef  ST
4585 #define ST st->u.curlym
4586
4587         case CURLYM:    /* /A{m,n}B/ where A is fixed-length */
4588
4589             /* This is an optimisation of CURLYX that enables us to push
4590              * only a single backtracking state, no matter how many matches
4591              * there are in {m,n}. It relies on the pattern being constant
4592              * length, with no parens to influence future backrefs
4593              */
4594
4595             ST.me = scan;
4596             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4597
4598             /* if paren positive, emulate an OPEN/CLOSE around A */
4599             if (ST.me->flags) {
4600                 U32 paren = ST.me->flags;
4601                 if (paren > PL_regsize)
4602                     PL_regsize = paren;
4603                 if (paren > *PL_reglastparen)
4604                     *PL_reglastparen = paren;
4605                 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4606             }
4607             ST.A = scan;
4608             ST.B = next;
4609             ST.alen = 0;
4610             ST.count = 0;
4611             ST.minmod = minmod;
4612             minmod = 0;
4613             ST.c1 = CHRTEST_UNINIT;
4614             REGCP_SET(ST.cp);
4615
4616             if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4617                 goto curlym_do_B;
4618
4619           curlym_do_A: /* execute the A in /A{m,n}B/  */
4620             PL_reginput = locinput;
4621             PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4622             /* NOTREACHED */
4623
4624         case CURLYM_A: /* we've just matched an A */
4625             locinput = st->locinput;
4626             nextchr = UCHARAT(locinput);
4627
4628             ST.count++;
4629             /* after first match, determine A's length: u.curlym.alen */
4630             if (ST.count == 1) {
4631                 if (PL_reg_match_utf8) {
4632                     char *s = locinput;
4633                     while (s < PL_reginput) {
4634                         ST.alen++;
4635                         s += UTF8SKIP(s);
4636                     }
4637                 }
4638                 else {
4639                     ST.alen = PL_reginput - locinput;
4640                 }
4641                 if (ST.alen == 0)
4642                     ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4643             }
4644             DEBUG_EXECUTE_r(
4645                 PerlIO_printf(Perl_debug_log,
4646                           "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4647                           (int)(REPORT_CODE_OFF+(depth*2)), "",
4648                           (IV) ST.count, (IV)ST.alen)
4649             );
4650
4651             locinput = PL_reginput;
4652
4653             if (cur_eval && cur_eval->u.eval.close_paren &&
4654                 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4655                 goto fake_end;
4656
4657             {
4658                 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4659                 if ( max == REG_INFTY || ST.count < max )
4660                     goto curlym_do_A; /* try to match another A */
4661             }
4662             goto curlym_do_B; /* try to match B */
4663
4664         case CURLYM_A_fail: /* just failed to match an A */
4665             REGCP_UNWIND(ST.cp);
4666
4667             if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4668                 || (cur_eval && cur_eval->u.eval.close_paren &&
4669                     cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4670                 sayNO;
4671
4672           curlym_do_B: /* execute the B in /A{m,n}B/  */
4673             PL_reginput = locinput;
4674             if (ST.c1 == CHRTEST_UNINIT) {
4675                 /* calculate c1 and c2 for possible match of 1st char
4676                  * following curly */
4677                 ST.c1 = ST.c2 = CHRTEST_VOID;
4678                 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4679                     regnode *text_node = ST.B;
4680                     if (! HAS_TEXT(text_node))
4681                         FIND_NEXT_IMPT(text_node);
4682                     /* this used to be
4683
4684                         (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4685
4686                         But the former is redundant in light of the latter.
4687
4688                         if this changes back then the macro for
4689                         IS_TEXT and friends need to change.
4690                      */
4691                     if (PL_regkind[OP(text_node)] == EXACT)
4692                     {
4693
4694                         ST.c1 = (U8)*STRING(text_node);
4695                         ST.c2 =
4696                             (IS_TEXTF(text_node))
4697                             ? PL_fold[ST.c1]
4698                             : (IS_TEXTFL(text_node))
4699                                 ? PL_fold_locale[ST.c1]
4700                                 : ST.c1;
4701                     }
4702                 }
4703             }
4704
4705             DEBUG_EXECUTE_r(
4706                 PerlIO_printf(Perl_debug_log,
4707                     "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4708                     (int)(REPORT_CODE_OFF+(depth*2)),
4709                     "", (IV)ST.count)
4710                 );
4711             if (ST.c1 != CHRTEST_VOID
4712                     && UCHARAT(PL_reginput) != ST.c1
4713                     && UCHARAT(PL_reginput) != ST.c2)
4714             {
4715                 /* simulate B failing */
4716                 DEBUG_OPTIMISE_r(
4717                     PerlIO_printf(Perl_debug_log,
4718                         "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4719                         (int)(REPORT_CODE_OFF+(depth*2)),"",
4720                         (IV)ST.c1,(IV)ST.c2
4721                 ));
4722                 state_num = CURLYM_B_fail;
4723                 goto reenter_switch;
4724             }
4725
4726             if (ST.me->flags) {
4727                 /* mark current A as captured */
4728                 I32 paren = ST.me->flags;
4729                 if (ST.count) {
4730                     PL_regoffs[paren].start
4731                         = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4732                     PL_regoffs[paren].end = PL_reginput - PL_bostr;
4733                     /*dmq: *PL_reglastcloseparen = paren; */
4734                 }
4735                 else
4736                     PL_regoffs[paren].end = -1;
4737                 if (cur_eval && cur_eval->u.eval.close_paren &&
4738                     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4739                 {
4740                     if (ST.count)
4741                         goto fake_end;
4742                     else
4743                         sayNO;
4744                 }
4745             }
4746
4747             PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4748             /* NOTREACHED */
4749
4750         case CURLYM_B_fail: /* just failed to match a B */
4751             REGCP_UNWIND(ST.cp);
4752             if (ST.minmod) {
4753                 I32 max = ARG2(ST.me);
4754                 if (max != REG_INFTY && ST.count == max)
4755                     sayNO;
4756                 goto curlym_do_A; /* try to match a further A */
4757             }
4758             /* backtrack one A */
4759             if (ST.count == ARG1(ST.me) /* min */)
4760                 sayNO;
4761             ST.count--;
4762             locinput = HOPc(locinput, -ST.alen);
4763             goto curlym_do_B; /* try to match B */
4764
4765 #undef ST
4766 #define ST st->u.curly
4767
4768 #define CURLY_SETPAREN(paren, success) \
4769     if (paren) { \
4770         if (success) { \
4771             PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4772             PL_regoffs[paren].end = locinput - PL_bostr; \
4773             *PL_reglastcloseparen = paren; \
4774         } \
4775         else \
4776             PL_regoffs[paren].end = -1; \
4777     }
4778
4779         case STAR:              /*  /A*B/ where A is width 1 */
4780             ST.paren = 0;
4781             ST.min = 0;
4782             ST.max = REG_INFTY;
4783             scan = NEXTOPER(scan);
4784             goto repeat;
4785         case PLUS:              /*  /A+B/ where A is width 1 */
4786             ST.paren = 0;
4787             ST.min = 1;
4788             ST.max = REG_INFTY;
4789             scan = NEXTOPER(scan);
4790             goto repeat;
4791         case CURLYN:            /*  /(A){m,n}B/ where A is width 1 */
4792             ST.paren = scan->flags;     /* Which paren to set */
4793             if (ST.paren > PL_regsize)
4794                 PL_regsize = ST.paren;
4795             if (ST.paren > *PL_reglastparen)
4796                 *PL_reglastparen = ST.paren;
4797             ST.min = ARG1(scan);  /* min to match */
4798             ST.max = ARG2(scan);  /* max to match */
4799             if (cur_eval && cur_eval->u.eval.close_paren &&
4800                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4801                 ST.min=1;
4802                 ST.max=1;
4803             }
4804             scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4805             goto repeat;
4806         case CURLY:             /*  /A{m,n}B/ where A is width 1 */
4807             ST.paren = 0;
4808             ST.min = ARG1(scan);  /* min to match */
4809             ST.max = ARG2(scan);  /* max to match */
4810             scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4811           repeat:
4812             /*
4813             * Lookahead to avoid useless match attempts
4814             * when we know what character comes next.
4815             *
4816             * Used to only do .*x and .*?x, but now it allows
4817             * for )'s, ('s and (?{ ... })'s to be in the way
4818             * of the quantifier and the EXACT-like node.  -- japhy
4819             */
4820
4821             if (ST.min > ST.max) /* XXX make this a compile-time check? */
4822                 sayNO;
4823             if (HAS_TEXT(next) || JUMPABLE(next)) {
4824                 U8 *s;
4825                 regnode *text_node = next;
4826
4827                 if (! HAS_TEXT(text_node))
4828                     FIND_NEXT_IMPT(text_node);
4829
4830                 if (! HAS_TEXT(text_node))
4831                     ST.c1 = ST.c2 = CHRTEST_VOID;
4832                 else {
4833                     if ( PL_regkind[OP(text_node)] != EXACT ) {
4834                         ST.c1 = ST.c2 = CHRTEST_VOID;
4835                         goto assume_ok_easy;
4836                     }
4837                     else
4838                         s = (U8*)STRING(text_node);
4839
4840                     /*  Currently we only get here when
4841
4842                         PL_rekind[OP(text_node)] == EXACT
4843
4844                         if this changes back then the macro for IS_TEXT and
4845                         friends need to change. */
4846                     if (!UTF) {
4847                         ST.c2 = ST.c1 = *s;
4848                         if (IS_TEXTF(text_node))
4849                             ST.c2 = PL_fold[ST.c1];
4850                         else if (IS_TEXTFL(text_node))
4851                             ST.c2 = PL_fold_locale[ST.c1];
4852                     }
4853                     else { /* UTF */
4854                         if (IS_TEXTF(text_node)) {
4855                              STRLEN ulen1, ulen2;
4856                              U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4857                              U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4858
4859                              to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4860                              to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4861 #ifdef EBCDIC
4862                              ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4863                                                     ckWARN(WARN_UTF8) ?
4864                                                     0 : UTF8_ALLOW_ANY);
4865                              ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4866                                                     ckWARN(WARN_UTF8) ?
4867                                                     0 : UTF8_ALLOW_ANY);
4868 #else
4869                              ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4870                                                     uniflags);
4871                              ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4872                                                     uniflags);
4873 #endif
4874                         }
4875                         else {
4876                             ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4877                                                      uniflags);
4878                         }
4879                     }
4880                 }
4881             }
4882             else
4883                 ST.c1 = ST.c2 = CHRTEST_VOID;
4884         assume_ok_easy:
4885
4886             ST.A = scan;
4887             ST.B = next;
4888             PL_reginput = locinput;
4889             if (minmod) {
4890                 minmod = 0;
4891                 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4892                     sayNO;
4893                 ST.count = ST.min;
4894                 locinput = PL_reginput;
4895                 REGCP_SET(ST.cp);
4896                 if (ST.c1 == CHRTEST_VOID)
4897                     goto curly_try_B_min;
4898
4899                 ST.oldloc = locinput;
4900
4901                 /* set ST.maxpos to the furthest point along the
4902                  * string that could possibly match */
4903                 if  (ST.max == REG_INFTY) {
4904                     ST.maxpos = PL_regeol - 1;
4905                     if (do_utf8)
4906                         while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4907                             ST.maxpos--;
4908                 }
4909                 else if (do_utf8) {
4910                     int m = ST.max - ST.min;
4911                     for (ST.maxpos = locinput;
4912                          m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4913                         ST.maxpos += UTF8SKIP(ST.maxpos);
4914                 }
4915                 else {
4916                     ST.maxpos = locinput + ST.max - ST.min;
4917                     if (ST.maxpos >= PL_regeol)
4918                         ST.maxpos = PL_regeol - 1;
4919                 }
4920                 goto curly_try_B_min_known;
4921
4922             }
4923             else {
4924                 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4925                 locinput = PL_reginput;
4926                 if (ST.count < ST.min)
4927                     sayNO;
4928                 if ((ST.count > ST.min)
4929                     && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4930                 {
4931                     /* A{m,n} must come at the end of the string, there's
4932                      * no point in backing off ... */
4933                     ST.min = ST.count;
4934                     /* ...except that $ and \Z can match before *and* after
4935                        newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
4936                        We may back off by one in this case. */
4937                     if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4938                         ST.min--;
4939                 }
4940                 REGCP_SET(ST.cp);
4941                 goto curly_try_B_max;
4942             }
4943             /* NOTREACHED */
4944
4945
4946         case CURLY_B_min_known_fail:
4947             /* failed to find B in a non-greedy match where c1,c2 valid */
4948             if (ST.paren && ST.count)
4949                 PL_regoffs[ST.paren].end = -1;
4950
4951             PL_reginput = locinput;     /* Could be reset... */
4952             REGCP_UNWIND(ST.cp);
4953             /* Couldn't or didn't -- move forward. */
4954             ST.oldloc = locinput;
4955             if (do_utf8)
4956                 locinput += UTF8SKIP(locinput);
4957             else
4958                 locinput++;
4959             ST.count++;
4960           curly_try_B_min_known:
4961              /* find the next place where 'B' could work, then call B */
4962             {
4963                 int n;
4964                 if (do_utf8) {
4965                     n = (ST.oldloc == locinput) ? 0 : 1;
4966                     if (ST.c1 == ST.c2) {
4967                         STRLEN len;
4968                         /* set n to utf8_distance(oldloc, locinput) */
4969                         while (locinput <= ST.maxpos &&
4970                                utf8n_to_uvchr((U8*)locinput,
4971                                               UTF8_MAXBYTES, &len,
4972                                               uniflags) != (UV)ST.c1) {
4973                             locinput += len;
4974                             n++;
4975                         }
4976                     }
4977                     else {
4978                         /* set n to utf8_distance(oldloc, locinput) */
4979                         while (locinput <= ST.maxpos) {
4980                             STRLEN len;
4981                             const UV c = utf8n_to_uvchr((U8*)locinput,
4982                                                   UTF8_MAXBYTES, &len,
4983                                                   uniflags);
4984                             if (c == (UV)ST.c1 || c == (UV)ST.c2)
4985                                 break;
4986                             locinput += len;
4987                             n++;
4988                         }
4989                     }
4990                 }
4991                 else {
4992                     if (ST.c1 == ST.c2) {
4993                         while (locinput <= ST.maxpos &&
4994                                UCHARAT(locinput) != ST.c1)
4995                             locinput++;
4996                     }
4997                     else {
4998                         while (locinput <= ST.maxpos
4999                                && UCHARAT(locinput) != ST.c1
5000                                && UCHARAT(locinput) != ST.c2)
5001                             locinput++;
5002                     }
5003                     n = locinput - ST.oldloc;
5004                 }
5005                 if (locinput > ST.maxpos)
5006                     sayNO;
5007                 /* PL_reginput == oldloc now */
5008                 if (n) {
5009                     ST.count += n;
5010                     if (regrepeat(rex, ST.A, n, depth) < n)
5011                         sayNO;
5012                 }
5013                 PL_reginput = locinput;
5014                 CURLY_SETPAREN(ST.paren, ST.count);
5015                 if (cur_eval && cur_eval->u.eval.close_paren &&
5016                     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5017                     goto fake_end;
5018                 }
5019                 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5020             }
5021             /* NOTREACHED */
5022
5023
5024         case CURLY_B_min_fail:
5025             /* failed to find B in a non-greedy match where c1,c2 invalid */
5026             if (ST.paren && ST.count)
5027                 PL_regoffs[ST.paren].end = -1;
5028
5029             REGCP_UNWIND(ST.cp);
5030             /* failed -- move forward one */
5031             PL_reginput = locinput;
5032             if (regrepeat(rex, ST.A, 1, depth)) {
5033                 ST.count++;
5034                 locinput = PL_reginput;
5035                 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5036                         ST.count > 0)) /* count overflow ? */
5037                 {
5038                   curly_try_B_min:
5039                     CURLY_SETPAREN(ST.paren, ST.count);
5040                     if (cur_eval && cur_eval->u.eval.close_paren &&
5041                         cur_eval->u.eval.close_paren == (U32)ST.paren) {
5042                         goto fake_end;
5043                     }
5044                     PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5045                 }
5046             }
5047             sayNO;
5048             /* NOTREACHED */
5049
5050
5051         curly_try_B_max:
5052             /* a successful greedy match: now try to match B */
5053             if (cur_eval && cur_eval->u.eval.close_paren &&
5054                 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5055                 goto fake_end;
5056             }
5057             {
5058                 UV c = 0;
5059                 if (ST.c1 != CHRTEST_VOID)
5060                     c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
5061                                            UTF8_MAXBYTES, 0, uniflags)
5062                                 : (UV) UCHARAT(PL_reginput);
5063                 /* If it could work, try it. */
5064                 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5065                     CURLY_SETPAREN(ST.paren, ST.count);
5066                     PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5067                     /* NOTREACHED */
5068                 }
5069             }
5070             /* FALL THROUGH */
5071         case CURLY_B_max_fail:
5072             /* failed to find B in a greedy match */
5073             if (ST.paren && ST.count)
5074                 PL_regoffs[ST.paren].end = -1;
5075
5076             REGCP_UNWIND(ST.cp);
5077             /*  back up. */
5078             if (--ST.count < ST.min)
5079                 sayNO;
5080             PL_reginput = locinput = HOPc(locinput, -1);
5081             goto curly_try_B_max;
5082
5083 #undef ST
5084
5085         case END:
5086             fake_end:
5087             if (cur_eval) {
5088                 /* we've just finished A in /(??{A})B/; now continue with B */
5089                 I32 tmpix;
5090                 st->u.eval.toggle_reg_flags
5091                             = cur_eval->u.eval.toggle_reg_flags;
5092                 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5093
5094                 st->u.eval.prev_rex = rex_sv;           /* inner */
5095                 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5096                 rex = (struct regexp *)SvANY(rex_sv);
5097                 rexi = RXi_GET(rex);
5098                 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5099                 ReREFCNT_inc(rex_sv);
5100                 st->u.eval.cp = regcppush(0);   /* Save *all* the positions. */
5101
5102                 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5103                 PL_reglastparen = &rex->lastparen;
5104                 PL_reglastcloseparen = &rex->lastcloseparen;
5105
5106                 REGCP_SET(st->u.eval.lastcp);
5107                 PL_reginput = locinput;
5108
5109                 /* Restore parens of the outer rex without popping the
5110                  * savestack */
5111                 tmpix = PL_savestack_ix;
5112                 PL_savestack_ix = cur_eval->u.eval.lastcp;
5113                 regcppop(rex);
5114                 PL_savestack_ix = tmpix;
5115
5116                 st->u.eval.prev_eval = cur_eval;
5117                 cur_eval = cur_eval->u.eval.prev_eval;
5118                 DEBUG_EXECUTE_r(
5119                     PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5120                                       REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5121                 if ( nochange_depth )
5122                     nochange_depth--;
5123
5124                 PUSH_YES_STATE_GOTO(EVAL_AB,
5125                         st->u.eval.prev_eval->u.eval.B); /* match B */
5126             }
5127
5128             if (locinput < reginfo->till) {
5129                 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5130                                       "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5131                                       PL_colors[4],
5132                                       (long)(locinput - PL_reg_starttry),
5133                                       (long)(reginfo->till - PL_reg_starttry),
5134                                       PL_colors[5]));
5135
5136                 sayNO_SILENT;           /* Cannot match: too short. */
5137             }
5138             PL_reginput = locinput;     /* put where regtry can find it */
5139             sayYES;                     /* Success! */
5140
5141         case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5142             DEBUG_EXECUTE_r(
5143             PerlIO_printf(Perl_debug_log,
5144                 "%*s  %ssubpattern success...%s\n",
5145                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5146             PL_reginput = locinput;     /* put where regtry can find it */
5147             sayYES;                     /* Success! */
5148
5149 #undef  ST
5150 #define ST st->u.ifmatch
5151
5152         case SUSPEND:   /* (?>A) */
5153             ST.wanted = 1;
5154             PL_reginput = locinput;
5155             goto do_ifmatch;
5156
5157         case UNLESSM:   /* -ve lookaround: (?!A), or with flags, (?<!A) */
5158             ST.wanted = 0;
5159             goto ifmatch_trivial_fail_test;
5160
5161         case IFMATCH:   /* +ve lookaround: (?=A), or with flags, (?<=A) */
5162             ST.wanted = 1;
5163           ifmatch_trivial_fail_test:
5164             if (scan->flags) {
5165                 char * const s = HOPBACKc(locinput, scan->flags);
5166                 if (!s) {
5167                     /* trivial fail */
5168                     if (logical) {
5169                         logical = 0;
5170                         sw = 1 - (bool)ST.wanted;
5171                     }
5172                     else if (ST.wanted)
5173                         sayNO;
5174                     next = scan + ARG(scan);
5175                     if (next == scan)
5176                         next = NULL;
5177                     break;
5178                 }
5179                 PL_reginput = s;
5180             }
5181             else
5182                 PL_reginput = locinput;
5183
5184           do_ifmatch:
5185             ST.me = scan;
5186             ST.logical = logical;
5187             logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5188
5189             /* execute body of (?...A) */
5190             PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5191             /* NOTREACHED */
5192
5193         case IFMATCH_A_fail: /* body of (?...A) failed */
5194             ST.wanted = !ST.wanted;
5195             /* FALL THROUGH */
5196
5197         case IFMATCH_A: /* body of (?...A) succeeded */
5198             if (ST.logical) {
5199                 sw = (bool)ST.wanted;
5200             }
5201             else if (!ST.wanted)
5202                 sayNO;
5203
5204             if (OP(ST.me) == SUSPEND)
5205                 locinput = PL_reginput;
5206             else {
5207                 locinput = PL_reginput = st->locinput;
5208                 nextchr = UCHARAT(locinput);
5209             }
5210             scan = ST.me + ARG(ST.me);
5211             if (scan == ST.me)
5212                 scan = NULL;
5213             continue; /* execute B */
5214
5215 #undef ST
5216
5217         case LONGJMP:
5218             next = scan + ARG(scan);
5219             if (next == scan)
5220                 next = NULL;
5221             break;
5222         case COMMIT:
5223             reginfo->cutpoint = PL_regeol;
5224             /* FALLTHROUGH */
5225         case PRUNE:
5226             PL_reginput = locinput;
5227             if (!scan->flags)
5228                 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5229             PUSH_STATE_GOTO(COMMIT_next,next);
5230             /* NOTREACHED */
5231         case COMMIT_next_fail:
5232             no_final = 1;
5233             /* FALLTHROUGH */
5234         case OPFAIL:
5235             sayNO;
5236             /* NOTREACHED */
5237
5238 #define ST st->u.mark
5239         case MARKPOINT:
5240             ST.prev_mark = mark_state;
5241             ST.mark_name = sv_commit = sv_yes_mark
5242                 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5243             mark_state = st;
5244             ST.mark_loc = PL_reginput = locinput;
5245             PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5246             /* NOTREACHED */
5247         case MARKPOINT_next:
5248             mark_state = ST.prev_mark;
5249             sayYES;
5250             /* NOTREACHED */
5251         case MARKPOINT_next_fail:
5252             if (popmark && sv_eq(ST.mark_name,popmark))
5253             {
5254                 if (ST.mark_loc > startpoint)
5255                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5256                 popmark = NULL; /* we found our mark */
5257                 sv_commit = ST.mark_name;
5258
5259                 DEBUG_EXECUTE_r({
5260                         PerlIO_printf(Perl_debug_log,
5261                             "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5262                             REPORT_CODE_OFF+depth*2, "",
5263                             PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5264                 });
5265             }
5266             mark_state = ST.prev_mark;
5267             sv_yes_mark = mark_state ?
5268                 mark_state->u.mark.mark_name : NULL;
5269             sayNO;
5270             /* NOTREACHED */
5271         case SKIP:
5272             PL_reginput = locinput;
5273             if (scan->flags) {
5274                 /* (*SKIP) : if we fail we cut here*/
5275                 ST.mark_name = NULL;
5276                 ST.mark_loc = locinput;
5277                 PUSH_STATE_GOTO(SKIP_next,next);
5278             } else {
5279                 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5280                    otherwise do nothing.  Meaning we need to scan
5281                  */
5282                 regmatch_state *cur = mark_state;
5283                 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5284
5285                 while (cur) {
5286                     if ( sv_eq( cur->u.mark.mark_name,
5287                                 find ) )
5288                     {
5289                         ST.mark_name = find;
5290                         PUSH_STATE_GOTO( SKIP_next, next );
5291                     }
5292                     cur = cur->u.mark.prev_mark;
5293                 }
5294             }
5295             /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5296             break;
5297         case SKIP_next_fail:
5298             if (ST.mark_name) {
5299                 /* (*CUT:NAME) - Set up to search for the name as we
5300                    collapse the stack*/
5301                 popmark = ST.mark_name;
5302             } else {
5303                 /* (*CUT) - No name, we cut here.*/
5304                 if (ST.mark_loc > startpoint)
5305                     reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5306                 /* but we set sv_commit to latest mark_name if there
5307                    is one so they can test to see how things lead to this
5308                    cut */
5309                 if (mark_state)
5310                     sv_commit=mark_state->u.mark.mark_name;
5311             }
5312             no_final = 1;
5313             sayNO;
5314             /* NOTREACHED */
5315 #undef ST
5316         case FOLDCHAR:
5317             n = ARG(scan);
5318             if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5319                 locinput += ln;
5320             } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5321                 sayNO;
5322             } else  {
5323                 U8 folded[UTF8_MAXBYTES_CASE+1];
5324                 STRLEN foldlen;
5325                 const char * const l = locinput;
5326                 char *e = PL_regeol;
5327                 to_uni_fold(n, folded, &foldlen);
5328
5329                 if (ibcmp_utf8((const char*) folded, 0,  foldlen, 1,
5330                                l, &e, 0,  do_utf8)) {
5331                         sayNO;
5332                 }
5333                 locinput = e;
5334             }
5335             nextchr = UCHARAT(locinput);
5336             break;
5337         case LNBREAK:
5338             if ((n=is_LNBREAK(locinput,do_utf8))) {
5339                 locinput += n;
5340                 nextchr = UCHARAT(locinput);
5341             } else
5342                 sayNO;
5343             break;
5344
5345 #define CASE_CLASS(nAmE)                              \
5346         case nAmE:                                    \
5347             if ((n=is_##nAmE(locinput,do_utf8))) {    \
5348                 locinput += n;                        \
5349                 nextchr = UCHARAT(locinput);          \
5350             } else                                    \
5351                 sayNO;                                \
5352             break;                                    \
5353         case N##nAmE:                                 \
5354             if ((n=is_##nAmE(locinput,do_utf8))) {    \
5355                 sayNO;                                \
5356             } else {                                  \
5357                 locinput += UTF8SKIP(locinput);       \
5358                 nextchr = UCHARAT(locinput);          \
5359             }                                         \
5360             break
5361
5362         CASE_CLASS(VERTWS);
5363         CASE_CLASS(HORIZWS);
5364 #undef CASE_CLASS
5365
5366         default:
5367             PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5368                           PTR2UV(scan), OP(scan));
5369             Perl_croak(aTHX_ "regexp memory corruption");
5370
5371         } /* end switch */
5372
5373         /* switch break jumps here */
5374         scan = next; /* prepare to execute the next op and ... */
5375         continue;    /* ... jump back to the top, reusing st */
5376         /* NOTREACHED */
5377
5378       push_yes_state:
5379         /* push a state that backtracks on success */
5380         st->u.yes.prev_yes_state = yes_state;
5381         yes_state = st;
5382         /* FALL THROUGH */
5383       push_state:
5384         /* push a new regex state, then continue at scan  */
5385         {
5386             regmatch_state *newst;
5387
5388             DEBUG_STACK_r({
5389                 regmatch_state *cur = st;
5390                 regmatch_state *curyes = yes_state;
5391                 int curd = depth;
5392                 regmatch_slab *slab = PL_regmatch_slab;
5393                 for (;curd > -1;cur--,curd--) {
5394                     if (cur < SLAB_FIRST(slab)) {
5395                         slab = slab->prev;
5396                         cur = SLAB_LAST(slab);
5397                     }
5398                     PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5399                         REPORT_CODE_OFF + 2 + depth * 2,"",
5400                         curd, PL_reg_name[cur->resume_state],
5401                         (curyes == cur) ? "yes" : ""
5402                     );
5403                     if (curyes == cur)
5404                         curyes = cur->u.yes.prev_yes_state;
5405                 }
5406             } else
5407                 DEBUG_STATE_pp("push")
5408             );
5409             depth++;
5410             st->locinput = locinput;
5411             newst = st+1;
5412             if (newst >  SLAB_LAST(PL_regmatch_slab))
5413                 newst = S_push_slab(aTHX);
5414             PL_regmatch_state = newst;
5415
5416             locinput = PL_reginput;
5417             nextchr = UCHARAT(locinput);
5418             st = newst;
5419             continue;
5420             /* NOTREACHED */
5421         }
5422     }
5423
5424     /*
5425     * We get here only if there's trouble -- normally "case END" is
5426     * the terminating point.
5427     */
5428     Perl_croak(aTHX_ "corrupted regexp pointers");
5429     /*NOTREACHED*/
5430     sayNO;
5431
5432 yes:
5433     if (yes_state) {
5434         /* we have successfully completed a subexpression, but we must now
5435          * pop to the state marked by yes_state and continue from there */
5436         assert(st != yes_state);
5437 #ifdef DEBUGGING
5438         while (st != yes_state) {
5439             st--;
5440             if (st < SLAB_FIRST(PL_regmatch_slab)) {
5441                 PL_regmatch_slab = PL_regmatch_slab->prev;
5442                 st = SLAB_LAST(PL_regmatch_slab);
5443             }
5444             DEBUG_STATE_r({
5445                 if (no_final) {
5446                     DEBUG_STATE_pp("pop (no final)");
5447                 } else {
5448                     DEBUG_STATE_pp("pop (yes)");
5449                 }
5450             });
5451             depth--;
5452         }
5453 #else
5454         while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5455             || yes_state > SLAB_LAST(PL_regmatch_slab))
5456         {
5457             /* not in this slab, pop slab */
5458             depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5459             PL_regmatch_slab = PL_regmatch_slab->prev;
5460             st = SLAB_LAST(PL_regmatch_slab);
5461         }
5462         depth -= (st - yes_state);
5463 #endif
5464         st = yes_state;
5465         yes_state = st->u.yes.prev_yes_state;
5466         PL_regmatch_state = st;
5467
5468         if (no_final) {
5469             locinput= st->locinput;
5470             nextchr = UCHARAT(locinput);
5471         }
5472         state_num = st->resume_state + no_final;
5473         goto reenter_switch;
5474     }
5475
5476     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5477                           PL_colors[4], PL_colors[5]));
5478
5479     if (PL_reg_eval_set) {
5480         /* each successfully executed (?{...}) block does the equivalent of
5481          *   local $^R = do {...}
5482          * When popping the save stack, all these locals would be undone;
5483          * bypass this by setting the outermost saved $^R to the latest
5484          * value */
5485         if (oreplsv != GvSV(PL_replgv))
5486             sv_setsv(oreplsv, GvSV(PL_replgv));
5487     }
5488     result = 1;
5489     goto final_exit;
5490
5491 no:
5492     DEBUG_EXECUTE_r(
5493         PerlIO_printf(Perl_debug_log,
5494             "%*s  %sfailed...%s\n",
5495             REPORT_CODE_OFF+depth*2, "",
5496             PL_colors[4], PL_colors[5])
5497         );
5498
5499 no_silent:
5500     if (no_final) {
5501         if (yes_state) {
5502             goto yes;
5503         } else {
5504             goto final_exit;
5505         }
5506     }
5507     if (depth) {
5508         /* there's a previous state to backtrack to */
5509         st--;
5510         if (st < SLAB_FIRST(PL_regmatch_slab)) {
5511             PL_regmatch_slab = PL_regmatch_slab->prev;
5512             st = SLAB_LAST(PL_regmatch_slab);
5513         }
5514         PL_regmatch_state = st;
5515         locinput= st->locinput;
5516         nextchr = UCHARAT(locinput);
5517
5518         DEBUG_STATE_pp("pop");
5519         depth--;
5520         if (yes_state == st)
5521             yes_state = st->u.yes.prev_yes_state;
5522
5523         state_num = st->resume_state + 1; /* failure = success + 1 */
5524         goto reenter_switch;
5525     }
5526     result = 0;
5527
5528   final_exit:
5529     if (rex->intflags & PREGf_VERBARG_SEEN) {
5530         SV *sv_err = get_sv("REGERROR", 1);
5531         SV *sv_mrk = get_sv("REGMARK", 1);
5532         if (result) {
5533             sv_commit = &PL_sv_no;
5534             if (!sv_yes_mark)
5535                 sv_yes_mark = &PL_sv_yes;
5536         } else {
5537             if (!sv_commit)
5538                 sv_commit = &PL_sv_yes;
5539             sv_yes_mark = &PL_sv_no;
5540         }
5541         sv_setsv(sv_err, sv_commit);
5542         sv_setsv(sv_mrk, sv_yes_mark);
5543     }
5544
5545     /* clean up; in particular, free all slabs above current one */
5546     LEAVE_SCOPE(oldsave);
5547
5548     return result;
5549 }
5550
5551 /*
5552  - regrepeat - repeatedly match something simple, report how many
5553  */
5554 /*
5555  * [This routine now assumes that it will only match on things of length 1.
5556  * That was true before, but now we assume scan - reginput is the count,
5557  * rather than incrementing count on every character.  [Er, except utf8.]]
5558  */
5559 STATIC I32
5560 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5561 {
5562     dVAR;
5563     register char *scan;
5564     register I32 c;
5565     register char *loceol = PL_regeol;
5566     register I32 hardcount = 0;
5567     register bool do_utf8 = PL_reg_match_utf8;
5568 #ifndef DEBUGGING
5569     PERL_UNUSED_ARG(depth);
5570 #endif
5571
5572     PERL_ARGS_ASSERT_REGREPEAT;
5573
5574     scan = PL_reginput;
5575     if (max == REG_INFTY)
5576         max = I32_MAX;
5577     else if (max < loceol - scan)
5578         loceol = scan + max;
5579     switch (OP(p)) {
5580     case REG_ANY:
5581         if (do_utf8) {
5582             loceol = PL_regeol;
5583             while (scan < loceol && hardcount < max && *scan != '\n') {
5584                 scan += UTF8SKIP(scan);
5585                 hardcount++;
5586             }
5587         } else {
5588             while (scan < loceol && *scan != '\n')
5589                 scan++;
5590         }
5591         break;
5592     case SANY:
5593         if (do_utf8) {
5594             loceol = PL_regeol;
5595             while (scan < loceol && hardcount < max) {
5596                 scan += UTF8SKIP(scan);
5597                 hardcount++;
5598             }
5599         }
5600         else
5601             scan = loceol;
5602         break;
5603     case CANY:
5604         scan = loceol;
5605         break;
5606     case EXACT:         /* length of string is 1 */
5607         c = (U8)*STRING(p);
5608         while (scan < loceol && UCHARAT(scan) == c)
5609             scan++;
5610         break;
5611     case EXACTF:        /* length of string is 1 */
5612         c = (U8)*STRING(p);
5613         while (scan < loceol &&
5614                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5615             scan++;
5616         break;
5617     case EXACTFL:       /* length of string is 1 */
5618         PL_reg_flags |= RF_tainted;
5619         c = (U8)*STRING(p);
5620         while (scan < loceol &&
5621                (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5622             scan++;
5623         break;
5624     case ANYOF:
5625         if (do_utf8) {
5626             loceol = PL_regeol;
5627             while (hardcount < max && scan < loceol &&
5628                    reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5629                 scan += UTF8SKIP(scan);
5630                 hardcount++;
5631             }
5632         } else {
5633             while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5634                 scan++;
5635         }
5636         break;
5637     case ALNUM:
5638         if (do_utf8) {
5639             loceol = PL_regeol;
5640             LOAD_UTF8_CHARCLASS_ALNUM();
5641             while (hardcount < max && scan < loceol &&
5642                    swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5643                 scan += UTF8SKIP(scan);
5644                 hardcount++;
5645             }
5646         } else {
5647             while (scan < loceol && isALNUM(*scan))
5648                 scan++;
5649         }
5650         break;
5651     case ALNUML:
5652         PL_reg_flags |= RF_tainted;
5653         if (do_utf8) {
5654             loceol = PL_regeol;
5655             while (hardcount < max && scan < loceol &&
5656                    isALNUM_LC_utf8((U8*)scan)) {
5657                 scan += UTF8SKIP(scan);
5658                 hardcount++;
5659             }
5660         } else {
5661             while (scan < loceol && isALNUM_LC(*scan))
5662                 scan++;
5663         }
5664         break;
5665     case NALNUM:
5666         if (do_utf8) {
5667             loceol = PL_regeol;
5668             LOAD_UTF8_CHARCLASS_ALNUM();
5669             while (hardcount < max && scan < loceol &&
5670                    !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5671                 scan += UTF8SKIP(scan);
5672                 hardcount++;
5673             }
5674         } else {
5675             while (scan < loceol && !isALNUM(*scan))
5676                 scan++;
5677         }
5678         break;
5679     case NALNUML:
5680         PL_reg_flags |= RF_tainted;
5681         if (do_utf8) {
5682             loceol = PL_regeol;
5683             while (hardcount < max && scan < loceol &&
5684                    !isALNUM_LC_utf8((U8*)scan)) {
5685                 scan += UTF8SKIP(scan);
5686                 hardcount++;
5687             }
5688         } else {
5689             while (scan < loceol && !isALNUM_LC(*scan))
5690                 scan++;
5691         }
5692         break;
5693     case SPACE:
5694         if (do_utf8) {
5695             loceol = PL_regeol;
5696             LOAD_UTF8_CHARCLASS_SPACE();
5697             while (hardcount < max && scan < loceol &&
5698                    (*scan == ' ' ||
5699                     swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5700                 scan += UTF8SKIP(scan);
5701                 hardcount++;
5702             }
5703         } else {
5704             while (scan < loceol && isSPACE(*scan))
5705                 scan++;
5706         }
5707         break;
5708     case SPACEL:
5709         PL_reg_flags |= RF_tainted;
5710         if (do_utf8) {
5711             loceol = PL_regeol;
5712             while (hardcount < max && scan < loceol &&
5713                    (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5714                 scan += UTF8SKIP(scan);
5715                 hardcount++;
5716             }
5717         } else {
5718             while (scan < loceol && isSPACE_LC(*scan))
5719                 scan++;
5720         }
5721         break;
5722     case NSPACE:
5723         if (do_utf8) {
5724             loceol = PL_regeol;
5725             LOAD_UTF8_CHARCLASS_SPACE();
5726             while (hardcount < max && scan < loceol &&
5727                    !(*scan == ' ' ||
5728                      swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5729                 scan += UTF8SKIP(scan);
5730                 hardcount++;
5731             }
5732         } else {
5733             while (scan < loceol && !isSPACE(*scan))
5734                 scan++;
5735         }
5736         break;
5737     case NSPACEL:
5738         PL_reg_flags |= RF_tainted;
5739         if (do_utf8) {
5740             loceol = PL_regeol;
5741             while (hardcount < max && scan < loceol &&
5742                    !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5743                 scan += UTF8SKIP(scan);
5744                 hardcount++;
5745             }
5746         } else {
5747             while (scan < loceol && !isSPACE_LC(*scan))
5748                 scan++;
5749         }
5750         break;
5751     case DIGIT:
5752         if (do_utf8) {
5753             loceol = PL_regeol;
5754             LOAD_UTF8_CHARCLASS_DIGIT();
5755             while (hardcount < max && scan < loceol &&
5756                    swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5757                 scan += UTF8SKIP(scan);
5758                 hardcount++;
5759             }
5760         } else {
5761             while (scan < loceol && isDIGIT(*scan))
5762                 scan++;
5763         }
5764         break;
5765     case NDIGIT:
5766         if (do_utf8) {
5767             loceol = PL_regeol;
5768             LOAD_UTF8_CHARCLASS_DIGIT();
5769             while (hardcount < max && scan < loceol &&
5770                    !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5771                 scan += UTF8SKIP(scan);
5772                 hardcount++;
5773             }
5774         } else {
5775             while (scan < loceol && !isDIGIT(*scan))
5776                 scan++;
5777         }
5778     case LNBREAK:
5779         if (do_utf8) {
5780             loceol = PL_regeol;
5781             while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5782                 scan += c;
5783                 hardcount++;
5784             }
5785         } else {
5786             /*
5787               LNBREAK can match two latin chars, which is ok,
5788               because we have a null terminated string, but we
5789               have to use hardcount in this situation
5790             */
5791             while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
5792                 scan+=c;
5793                 hardcount++;
5794             }
5795         }
5796         break;
5797     case HORIZWS:
5798         if (do_utf8) {
5799             loceol = PL_regeol;
5800             while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5801                 scan += c;
5802                 hardcount++;
5803             }
5804         } else {
5805             while (scan < loceol && is_HORIZWS_latin1(scan))
5806                 scan++;
5807         }
5808         break;
5809     case NHORIZWS:
5810         if (do_utf8) {
5811             loceol = PL_regeol;
5812             while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5813                 scan += UTF8SKIP(scan);
5814                 hardcount++;
5815             }
5816         } else {
5817             while (scan < loceol && !is_HORIZWS_latin1(scan))
5818                 scan++;
5819
5820         }
5821         break;
5822     case VERTWS:
5823         if (do_utf8) {
5824             loceol = PL_regeol;
5825             while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5826                 scan += c;
5827                 hardcount++;
5828             }
5829         } else {
5830             while (scan < loceol && is_VERTWS_latin1(scan))
5831                 scan++;
5832
5833         }
5834         break;
5835     case NVERTWS:
5836         if (do_utf8) {
5837             loceol = PL_regeol;
5838             while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5839                 scan += UTF8SKIP(scan);
5840                 hardcount++;
5841             }
5842         } else {
5843             while (scan < loceol && !is_VERTWS_latin1(scan))
5844                 scan++;
5845
5846         }
5847         break;
5848
5849     default:            /* Called on something of 0 width. */
5850         break;          /* So match right here or not at all. */
5851     }
5852
5853     if (hardcount)
5854         c = hardcount;
5855     else
5856         c = scan - PL_reginput;
5857     PL_reginput = scan;
5858
5859     DEBUG_r({
5860         GET_RE_DEBUG_FLAGS_DECL;
5861         DEBUG_EXECUTE_r({
5862             SV * const prop = sv_newmortal();
5863             regprop(prog, prop, p);
5864             PerlIO_printf(Perl_debug_log,
5865                         "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
5866                         REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5867         });
5868     });
5869
5870     return(c);
5871 }
5872
5873
5874 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5875 /*
5876 - regclass_swash - prepare the utf8 swash
5877 */
5878
5879 SV *
5880 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5881 {
5882     dVAR;
5883     SV *sw  = NULL;
5884     SV *si  = NULL;
5885     SV *alt = NULL;
5886     RXi_GET_DECL(prog,progi);
5887     const struct reg_data * const data = prog ? progi->data : NULL;
5888
5889     PERL_ARGS_ASSERT_REGCLASS_SWASH;
5890
5891     if (data && data->count) {
5892         const U32 n = ARG(node);
5893
5894         if (data->what[n] == 's') {
5895             SV * const rv = MUTABLE_SV(data->data[n]);
5896             AV * const av = MUTABLE_AV(SvRV(rv));
5897             SV **const ary = AvARRAY(av);
5898             SV **a, **b;
5899
5900             /* See the end of regcomp.c:S_regclass() for
5901              * documentation of these array elements. */
5902
5903             si = *ary;
5904             a  = SvROK(ary[1]) ? &ary[1] : NULL;
5905             b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5906
5907             if (a)
5908                 sw = *a;
5909             else if (si && doinit) {
5910                 sw = swash_init("utf8", "", si, 1, 0);
5911                 (void)av_store(av, 1, sw);
5912             }
5913             if (b)
5914                 alt = *b;
5915         }
5916     }
5917
5918     if (listsvp)
5919         *listsvp = si;
5920     if (altsvp)
5921         *altsvp  = alt;
5922
5923     return sw;
5924 }
5925 #endif
5926
5927 /*
5928  - reginclass - determine if a character falls into a character class
5929
5930   The n is the ANYOF regnode, the p is the target string, lenp
5931   is pointer to the maximum length of how far to go in the p
5932   (if the lenp is zero, UTF8SKIP(p) is used),
5933   do_utf8 tells whether the target string is in UTF-8.
5934
5935  */
5936
5937 STATIC bool
5938 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5939 {
5940     dVAR;
5941     const char flags = ANYOF_FLAGS(n);
5942     bool match = FALSE;
5943     UV c = *p;
5944     STRLEN len = 0;
5945     STRLEN plen;
5946
5947     PERL_ARGS_ASSERT_REGINCLASS;
5948
5949     if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5950         c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5951                 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
5952                 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
5953                 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
5954                  * UTF8_ALLOW_FFFF */
5955         if (len == (STRLEN)-1)
5956             Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5957     }
5958
5959     plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5960     if (do_utf8 || (flags & ANYOF_UNICODE)) {
5961         if (lenp)
5962             *lenp = 0;
5963         if (do_utf8 && !ANYOF_RUNTIME(n)) {
5964             if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5965                 match = TRUE;
5966         }
5967         if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5968             match = TRUE;
5969         if (!match) {
5970             AV *av;
5971             SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5972
5973             if (sw) {
5974                 U8 * utf8_p;
5975                 if (do_utf8) {
5976                     utf8_p = (U8 *) p;
5977                 } else {
5978                     STRLEN len = 1;
5979                     utf8_p = bytes_to_utf8(p, &len);
5980                 }
5981                 if (swash_fetch(sw, utf8_p, 1))
5982                     match = TRUE;
5983                 else if (flags & ANYOF_FOLD) {
5984                     if (!match && lenp && av) {
5985                         I32 i;
5986                         for (i = 0; i <= av_len(av); i++) {
5987                             SV* const sv = *av_fetch(av, i, FALSE);
5988                             STRLEN len;
5989                             const char * const s = SvPV_const(sv, len);
5990                             if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
5991                                 *lenp = len;
5992                                 match = TRUE;
5993                                 break;
5994                             }
5995                         }
5996                     }
5997                     if (!match) {
5998                         U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5999
6000                         STRLEN tmplen;
6001                         to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6002                         if (swash_fetch(sw, tmpbuf, 1))
6003                             match = TRUE;
6004                     }
6005                 }
6006
6007                 /* If we allocated a string above, free it */
6008                 if (! do_utf8) Safefree(utf8_p);
6009             }
6010         }
6011         if (match && lenp && *lenp == 0)
6012             *lenp = UNISKIP(NATIVE_TO_UNI(c));
6013     }
6014     if (!match && c < 256) {
6015         if (ANYOF_BITMAP_TEST(n, c))
6016             match = TRUE;
6017         else if (flags & ANYOF_FOLD) {
6018             U8 f;
6019
6020             if (flags & ANYOF_LOCALE) {
6021                 PL_reg_flags |= RF_tainted;
6022                 f = PL_fold_locale[c];
6023             }
6024             else
6025                 f = PL_fold[c];
6026             if (f != c && ANYOF_BITMAP_TEST(n, f))
6027                 match = TRUE;
6028         }
6029
6030         if (!match && (flags & ANYOF_CLASS)) {
6031             PL_reg_flags |= RF_tainted;
6032             if (
6033                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6034                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6035                 (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6036                 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6037                 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6038                 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6039                 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6040                 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6041                 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6042                 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6043                 (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6044                 (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6045                 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6046                 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6047                 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6048                 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6049                 (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6050                 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6051                 (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6052                 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6053                 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6054                 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6055                 (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6056                 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6057                 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6058                 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6059                 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6060                 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6061                 (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6062                 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6063                 ) /* How's that for a conditional? */
6064             {
6065                 match = TRUE;
6066             }
6067         }
6068     }
6069
6070     return (flags & ANYOF_INVERT) ? !match : match;
6071 }
6072
6073 STATIC U8 *
6074 S_reghop3(U8 *s, I32 off, const U8* lim)
6075 {
6076     dVAR;
6077
6078     PERL_ARGS_ASSERT_REGHOP3;
6079
6080     if (off >= 0) {
6081         while (off-- && s < lim) {
6082             /* XXX could check well-formedness here */
6083             s += UTF8SKIP(s);
6084         }
6085     }
6086     else {
6087         while (off++ && s > lim) {
6088             s--;
6089             if (UTF8_IS_CONTINUED(*s)) {
6090                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6091                     s--;
6092             }
6093             /* XXX could check well-formedness here */
6094         }
6095     }
6096     return s;
6097 }
6098
6099 #ifdef XXX_dmq
6100 /* there are a bunch of places where we use two reghop3's that should
6101    be replaced with this routine. but since thats not done yet
6102    we ifdef it out - dmq
6103 */
6104 STATIC U8 *
6105 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6106 {
6107     dVAR;
6108
6109     PERL_ARGS_ASSERT_REGHOP4;
6110
6111     if (off >= 0) {
6112         while (off-- && s < rlim) {
6113             /* XXX could check well-formedness here */
6114             s += UTF8SKIP(s);
6115         }
6116     }
6117     else {
6118         while (off++ && s > llim) {
6119             s--;
6120             if (UTF8_IS_CONTINUED(*s)) {
6121                 while (s > llim && UTF8_IS_CONTINUATION(*s))
6122                     s--;
6123             }
6124             /* XXX could check well-formedness here */
6125         }
6126     }
6127     return s;
6128 }
6129 #endif
6130
6131 STATIC U8 *
6132 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6133 {
6134     dVAR;
6135
6136     PERL_ARGS_ASSERT_REGHOPMAYBE3;
6137
6138     if (off >= 0) {
6139         while (off-- && s < lim) {
6140             /* XXX could check well-formedness here */
6141             s += UTF8SKIP(s);
6142         }
6143         if (off >= 0)
6144             return NULL;
6145     }
6146     else {
6147         while (off++ && s > lim) {
6148             s--;
6149             if (UTF8_IS_CONTINUED(*s)) {
6150                 while (s > lim && UTF8_IS_CONTINUATION(*s))
6151                     s--;
6152             }
6153             /* XXX could check well-formedness here */
6154         }
6155         if (off <= 0)
6156             return NULL;
6157     }
6158     return s;
6159 }
6160
6161 static void
6162 restore_pos(pTHX_ void *arg)
6163 {
6164     dVAR;
6165     regexp * const rex = (regexp *)arg;
6166     if (PL_reg_eval_set) {
6167         if (PL_reg_oldsaved) {
6168             rex->subbeg = PL_reg_oldsaved;
6169             rex->sublen = PL_reg_oldsavedlen;
6170 #ifdef PERL_OLD_COPY_ON_WRITE
6171             rex->saved_copy = PL_nrs;
6172 #endif
6173             RXp_MATCH_COPIED_on(rex);
6174         }
6175         PL_reg_magic->mg_len = PL_reg_oldpos;
6176         PL_reg_eval_set = 0;
6177         PL_curpm = PL_reg_oldcurpm;
6178     }
6179 }
6180
6181 STATIC void
6182 S_to_utf8_substr(pTHX_ register regexp *prog)
6183 {
6184     int i = 1;
6185
6186     PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6187
6188     do {
6189         if (prog->substrs->data[i].substr
6190             && !prog->substrs->data[i].utf8_substr) {
6191             SV* const sv = newSVsv(prog->substrs->data[i].substr);
6192             prog->substrs->data[i].utf8_substr = sv;
6193             sv_utf8_upgrade(sv);
6194             if (SvVALID(prog->substrs->data[i].substr)) {
6195                 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6196                 if (flags & FBMcf_TAIL) {
6197                     /* Trim the trailing \n that fbm_compile added last
6198                        time.  */
6199                     SvCUR_set(sv, SvCUR(sv) - 1);
6200                     /* Whilst this makes the SV technically "invalid" (as its
6201                        buffer is no longer followed by "\0") when fbm_compile()
6202                        adds the "\n" back, a "\0" is restored.  */
6203                 }
6204                 fbm_compile(sv, flags);
6205             }
6206             if (prog->substrs->data[i].substr == prog->check_substr)
6207                 prog->check_utf8 = sv;
6208         }
6209     } while (i--);
6210 }
6211
6212 STATIC void
6213 S_to_byte_substr(pTHX_ register regexp *prog)
6214 {
6215     dVAR;
6216     int i = 1;
6217
6218     PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6219
6220     do {
6221         if (prog->substrs->data[i].utf8_substr
6222             && !prog->substrs->data[i].substr) {
6223             SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6224             if (sv_utf8_downgrade(sv, TRUE)) {
6225                 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6226                     const U8 flags
6227                         = BmFLAGS(prog->substrs->data[i].utf8_substr);
6228                     if (flags & FBMcf_TAIL) {
6229                         /* Trim the trailing \n that fbm_compile added last
6230                            time.  */
6231                         SvCUR_set(sv, SvCUR(sv) - 1);
6232                     }
6233                     fbm_compile(sv, flags);
6234                 }
6235             } else {
6236                 SvREFCNT_dec(sv);
6237                 sv = &PL_sv_undef;
6238             }
6239             prog->substrs->data[i].substr = sv;
6240             if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6241                 prog->check_substr = sv;
6242         }
6243     } while (i--);
6244 }
6245
6246 /*
6247  * Local variables:
6248  * c-indentation-style: bsd
6249  * c-basic-offset: 4
6250  * indent-tabs-mode: t
6251  * End:
6252  *
6253  * ex: set ts=8 sts=4 sw=4 noet:
6254  */