src/5014003/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *  One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  * Copyright (c) 1986 by University of Toronto.
  44  * Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  * Permission is granted to anyone to use this software for any
  47  * purpose on any computer system, and to redistribute it freely,
  48  * subject to the following restrictions:
  49  *
  50  * 1. The author is not responsible for the consequences of use of
  51  *  this software, no matter how awful, even if they arise
  52  *  from defects in it.
  53  *
  54  * 2. The origin of this software must not be misrepresented, either
  55  *  by explicit claim or by omission.
  56  *
  57  * 3. Altered versions must be plainly marked as such, and must not
  58  *  be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted 1 /* tainted information used? e.g. locale */
  85 #define RF_warned 2  /* warned about big count? */
  86
  87 #define RF_utf8  8  /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init  1  /* eval environment created */
  92 #define RS_set  2  /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC static
  96 #endif
  97
  98 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  99  * call if there are no complications: i.e., if everything matchable is
 100  * straight forward in the bitmap */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102            : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
 111 #define HOPc(pos,off) \
 112   (char *)(PL_reg_match_utf8 \
 113    ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114    : (U8*)(pos + off))
 115 #define HOPBACKc(pos, off) \
 116   (char*)(PL_reg_match_utf8\
 117    ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118    : (pos - off >= PL_bostr)  \
 119     ? (U8*)pos - off  \
 120     : NULL)
 121
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defined */
 126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 127  if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 128
 129 /* Doesn't do an assert to verify that is correct */
 130 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 131  if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 132
 133 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 134 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 135 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 136
 137 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 138   LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 139   LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 140   /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 141    * assert should likely and hopefully fail on an EBCDIC machine */ \
 142   LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 143                    \
 144   /* No asserts are done for these, in case called on an early        \
 145    * Unicode version in which they map to nothing */               \
 146   LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 147   LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);     /* U+1100 "\xe1\x84\x80" */ \
 148   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 149   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 150   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 151   LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 152   LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 153
 154 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
 155
 156 /* The actual code for CCC_TRY, which uses several variables from the routine
 157  * it's callable from.  It is designed to be the bulk of a case statement.
 158  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 159  *      nextchr matches the class.
 160  * UTF8_TEST is the whole test string to use for utf8 targets
 161  * LOAD is what to use to test, and if not present to load in the swash for the
 162  * class
 163  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 164  * UTF8_TEST test.
 165  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 166  * utf8 and a variant, load the swash if necessary and test using the utf8
 167  * test.  Advance to the next character if test is ok, otherwise fail; If not
 168  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 169  * fails, or advance to the next character */
 170
 171 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 172  if (locinput >= PL_regeol) {                                              \
 173   sayNO;                                                                \
 174  }                                                                         \
 175  if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 176   LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 177   if (POS_OR_NEG (UTF8_TEST)) {                                         \
 178    sayNO;                                                            \
 179   }                                                                     \
 180   locinput += PL_utf8skip[nextchr];                                     \
 181   nextchr = UCHARAT(locinput);                                          \
 182   break;                                                                \
 183  }                                                                         \
 184  if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 185   sayNO;                                                                \
 186  }                                                                         \
 187  nextchr = UCHARAT(++locinput);                                            \
 188  break;
 189
 190 /* Handle the non-locale cases for a character class and its complement.  It
 191  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 192  * This is because that code fails when the test succeeds, so we want to have
 193  * the test fail so that the code succeeds.  The swash is stored in a
 194  * predictable PL_ place */
 195 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 196       CLASS, STR)                                        \
 197  case NAME:                                                                \
 198   _CCC_TRY_CODE( !, FUNC,                                               \
 199       cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 200            (U8*)locinput, TRUE)),            \
 201       CLASS, STR)                                         \
 202  case NNAME:                                                               \
 203   _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 204       cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 205            (U8*)locinput, TRUE)),            \
 206       CLASS, STR)                                         \
 207
 208 /* Generate the case statements for both locale and non-locale character
 209  * classes in regmatch for classes that don't have special unicode semantics.
 210  * Locales don't use an immediate swash, but an intermediary special locale
 211  * function that is called on the pointer to the current place in the input
 212  * string.  That function will resolve to needing the same swash.  One might
 213  * think that because we don't know what the locale will match, we shouldn't
 214  * check with the swash loading function that it loaded properly; ie, that we
 215  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 216  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 217  * irrelevant here */
 218 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 219     NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 220     NAMEA, NNAMEA, FUNCA,                                         \
 221     CLASS, STR)                                                   \
 222  case NAMEL:                                                               \
 223   PL_reg_flags |= RF_tainted;                                           \
 224   _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 225  case NNAMEL:                                                              \
 226   PL_reg_flags |= RF_tainted;                                           \
 227   _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 228      CLASS, STR)                                            \
 229  case NAMEA:                                                               \
 230   if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 231    sayNO;                                                            \
 232   }                                                                     \
 233   /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 234   nextchr = UCHARAT(++locinput);                                        \
 235   break;                                                                \
 236  case NNAMEA:                                                              \
 237   if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 238    sayNO;                                                            \
 239   }                                                                     \
 240   if (utf8_target) {                                                    \
 241    locinput += PL_utf8skip[nextchr];                                 \
 242    nextchr = UCHARAT(locinput);                                      \
 243   }                                                                     \
 244   else {                                                                \
 245    nextchr = UCHARAT(++locinput);                                    \
 246   }                                                                     \
 247   break;                                                                \
 248  /* Generate the non-locale cases */                                       \
 249  _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 250
 251 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 252  * statements to handle separate Unicode semantics nodes */
 253 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 254     NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 255     NAMEU, NNAMEU, FUNCU,                                        \
 256     NAMEA, NNAMEA, FUNCA,                                        \
 257     CLASS, STR)                                                  \
 258  CCC_TRY(NAME, NNAME, FUNC,                                                 \
 259    NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 260    NAMEA, NNAMEA, FUNCA,                                              \
 261    CLASS, STR)                                                        \
 262  _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 263
 264 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 265
 266 /* for use after a quantifier and before an EXACT-like node -- japhy */
 267 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 268  *
 269  * NOTE that *nothing* that affects backtracking should be in here, specifically
 270  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 271  * node that is in between two EXACT like nodes when ascertaining what the required
 272  * "follow" character is. This should probably be moved to regex compile time
 273  * although it may be done at run time beause of the REF possibility - more
 274  * investigation required. -- demerphq
 275 */
 276 #define JUMPABLE(rn) (      \
 277  OP(rn) == OPEN ||       \
 278  (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 279  OP(rn) == EVAL ||   \
 280  OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 281  OP(rn) == PLUS || OP(rn) == MINMOD || \
 282  OP(rn) == KEEPS || \
 283  (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 284 )
 285 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 286
 287 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 288
 289 #if 0
 290 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 291    we don't need this definition. */
 292 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 293 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 294 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 295
 296 #else
 297 /* ... so we use this as its faster. */
 298 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 299 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 300 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 301 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 302
 303 #endif
 304
 305 /*
 306   Search for mandatory following text node; for lookahead, the text must
 307   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 308 */
 309 #define FIND_NEXT_IMPT(rn) STMT_START { \
 310  while (JUMPABLE(rn)) { \
 311   const OPCODE type = OP(rn); \
 312   if (type == SUSPEND || PL_regkind[type] == CURLY) \
 313    rn = NEXTOPER(NEXTOPER(rn)); \
 314   else if (type == PLUS) \
 315    rn = NEXTOPER(rn); \
 316   else if (type == IFMATCH) \
 317    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 318   else rn += NEXT_OFF(rn); \
 319  } \
 320 } STMT_END
 321
 322
 323 static void restore_pos(pTHX_ void *arg);
 324
 325 #define REGCP_PAREN_ELEMS 4
 326 #define REGCP_OTHER_ELEMS 5
 327 #define REGCP_FRAME_ELEMS 1
 328 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 329  * are needed for the regexp context stack bookkeeping. */
 330
 331 STATIC CHECKPOINT
 332 S_regcppush(pTHX_ I32 parenfloor)
 333 {
 334  dVAR;
 335  const int retval = PL_savestack_ix;
 336  const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 337  const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 338  const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 339  int p;
 340  GET_RE_DEBUG_FLAGS_DECL;
 341
 342  if (paren_elems_to_push < 0)
 343   Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 344
 345  if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 346   Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 347     " out of range (%lu-%ld)",
 348     total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 349
 350  SSGROW(total_elems + REGCP_FRAME_ELEMS);
 351
 352  for (p = PL_regsize; p > parenfloor; p--) {
 353 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 354   SSPUSHINT(PL_regoffs[p].end);
 355   SSPUSHINT(PL_regoffs[p].start);
 356   SSPUSHPTR(PL_reg_start_tmp[p]);
 357   SSPUSHINT(p);
 358   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 359   "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 360      (UV)p, (IV)PL_regoffs[p].start,
 361      (IV)(PL_reg_start_tmp[p] - PL_bostr),
 362      (IV)PL_regoffs[p].end
 363   ));
 364  }
 365 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 366  SSPUSHPTR(PL_regoffs);
 367  SSPUSHINT(PL_regsize);
 368  SSPUSHINT(*PL_reglastparen);
 369  SSPUSHINT(*PL_reglastcloseparen);
 370  SSPUSHPTR(PL_reginput);
 371  SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 372
 373  return retval;
 374 }
 375
 376 /* These are needed since we do not localize EVAL nodes: */
 377 #define REGCP_SET(cp)                                           \
 378  DEBUG_STATE_r(                                              \
 379    PerlIO_printf(Perl_debug_log,          \
 380     "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 381     (IV)PL_savestack_ix));                          \
 382  cp = PL_savestack_ix
 383
 384 #define REGCP_UNWIND(cp)                                        \
 385  DEBUG_STATE_r(                                              \
 386   if (cp != PL_savestack_ix)                   \
 387     PerlIO_printf(Perl_debug_log,          \
 388     "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 389     (IV)(cp), (IV)PL_savestack_ix));                \
 390  regcpblow(cp)
 391
 392 STATIC char *
 393 S_regcppop(pTHX_ const regexp *rex)
 394 {
 395  dVAR;
 396  UV i;
 397  char *input;
 398  GET_RE_DEBUG_FLAGS_DECL;
 399
 400  PERL_ARGS_ASSERT_REGCPPOP;
 401
 402  /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 403  i = SSPOPUV;
 404  assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 405  i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 406  input = (char *) SSPOPPTR;
 407  *PL_reglastcloseparen = SSPOPINT;
 408  *PL_reglastparen = SSPOPINT;
 409  PL_regsize = SSPOPINT;
 410  PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 411
 412  i -= REGCP_OTHER_ELEMS;
 413  /* Now restore the parentheses context. */
 414  for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 415   I32 tmps;
 416   U32 paren = (U32)SSPOPINT;
 417   PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 418   PL_regoffs[paren].start = SSPOPINT;
 419   tmps = SSPOPINT;
 420   if (paren <= *PL_reglastparen)
 421    PL_regoffs[paren].end = tmps;
 422   DEBUG_BUFFERS_r(
 423    PerlIO_printf(Perl_debug_log,
 424       "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 425       (UV)paren, (IV)PL_regoffs[paren].start,
 426       (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 427       (IV)PL_regoffs[paren].end,
 428       (paren > *PL_reglastparen ? "(no)" : ""));
 429   );
 430  }
 431  DEBUG_BUFFERS_r(
 432   if (*PL_reglastparen + 1 <= rex->nparens) {
 433    PerlIO_printf(Perl_debug_log,
 434       "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 435       (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 436   }
 437  );
 438 #if 1
 439  /* It would seem that the similar code in regtry()
 440  * already takes care of this, and in fact it is in
 441  * a better location to since this code can #if 0-ed out
 442  * but the code in regtry() is needed or otherwise tests
 443  * requiring null fields (pat.t#187 and split.t#{13,14}
 444  * (as of patchlevel 7877)  will fail.  Then again,
 445  * this code seems to be necessary or otherwise
 446  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 447  * --jhi updated by dapm */
 448  for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 449   if (i > PL_regsize)
 450    PL_regoffs[i].start = -1;
 451   PL_regoffs[i].end = -1;
 452  }
 453 #endif
 454  return input;
 455 }
 456
 457 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
 458
 459 /*
 460  * pregexec and friends
 461  */
 462
 463 #ifndef PERL_IN_XSUB_RE
 464 /*
 465  - pregexec - match a regexp against a string
 466  */
 467 I32
 468 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 469   char *strbeg, I32 minend, SV *screamer, U32 nosave)
 470 /* strend: pointer to null at end of string */
 471 /* strbeg: real beginning of string */
 472 /* minend: end of match must be >=minend after stringarg. */
 473 /* nosave: For optimizations. */
 474 {
 475  PERL_ARGS_ASSERT_PREGEXEC;
 476
 477  return
 478   regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 479      nosave ? 0 : REXEC_COPY_STR);
 480 }
 481 #endif
 482
 483 /*
 484  * Need to implement the following flags for reg_anch:
 485  *
 486  * USE_INTUIT_NOML  - Useful to call re_intuit_start() first
 487  * USE_INTUIT_ML
 488  * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
 489  * INTUIT_AUTORITATIVE_ML
 490  * INTUIT_ONCE_NOML  - Intuit can match in one location only.
 491  * INTUIT_ONCE_ML
 492  *
 493  * Another flag for this function: SECOND_TIME (so that float substrs
 494  * with giant delta may be not rechecked).
 495  */
 496
 497 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 498
 499 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 500    Otherwise, only SvCUR(sv) is used to get strbeg. */
 501
 502 /* XXXX We assume that strpos is strbeg unless sv. */
 503
 504 /* XXXX Some places assume that there is a fixed substring.
 505   An update may be needed if optimizer marks as "INTUITable"
 506   RExen without fixed substrings.  Similarly, it is assumed that
 507   lengths of all the strings are no more than minlen, thus they
 508   cannot come from lookahead.
 509   (Or minlen should take into account lookahead.)
 510   NOTE: Some of this comment is not correct. minlen does now take account
 511   of lookahead/behind. Further research is required. -- demerphq
 512
 513 */
 514
 515 /* A failure to find a constant substring means that there is no need to make
 516    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 517    finding a substring too deep into the string means that less calls to
 518    regtry() should be needed.
 519
 520    REx compiler's optimizer found 4 possible hints:
 521   a) Anchored substring;
 522   b) Fixed substring;
 523   c) Whether we are anchored (beginning-of-line or \G);
 524   d) First node (of those at offset 0) which may distinguish positions;
 525    We use a)b)d) and multiline-part of c), and try to find a position in the
 526    string which does not contradict any of them.
 527  */
 528
 529 /* Most of decisions we do here should have been done at compile time.
 530    The nodes of the REx which we used for the search should have been
 531    deleted from the finite automaton. */
 532
 533 char *
 534 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 535      char *strend, const U32 flags, re_scream_pos_data *data)
 536 {
 537  dVAR;
 538  struct regexp *const prog = (struct regexp *)SvANY(rx);
 539  register I32 start_shift = 0;
 540  /* Should be nonnegative! */
 541  register I32 end_shift   = 0;
 542  register char *s;
 543  register SV *check;
 544  char *strbeg;
 545  char *t;
 546  const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 547  I32 ml_anch;
 548  register char *other_last = NULL; /* other substr checked before this */
 549  char *check_at = NULL;  /* check substr found at this pos */
 550  const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 551  RXi_GET_DECL(prog,progi);
 552 #ifdef DEBUGGING
 553  const char * const i_strpos = strpos;
 554 #endif
 555  GET_RE_DEBUG_FLAGS_DECL;
 556
 557  PERL_ARGS_ASSERT_RE_INTUIT_START;
 558
 559  RX_MATCH_UTF8_set(rx,utf8_target);
 560
 561  if (RX_UTF8(rx)) {
 562   PL_reg_flags |= RF_utf8;
 563  }
 564  DEBUG_EXECUTE_r(
 565   debug_start_match(rx, utf8_target, strpos, strend,
 566    sv ? "Guessing start of match in sv for"
 567    : "Guessing start of match in string for");
 568    );
 569
 570  /* CHR_DIST() would be more correct here but it makes things slow. */
 571  if (prog->minlen > strend - strpos) {
 572   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 573        "String too short... [re_intuit_start]\n"));
 574   goto fail;
 575  }
 576
 577  strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 578  PL_regeol = strend;
 579  if (utf8_target) {
 580   if (!prog->check_utf8 && prog->check_substr)
 581    to_utf8_substr(prog);
 582   check = prog->check_utf8;
 583  } else {
 584   if (!prog->check_substr && prog->check_utf8)
 585    to_byte_substr(prog);
 586   check = prog->check_substr;
 587  }
 588  if (check == &PL_sv_undef) {
 589   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 590     "Non-utf8 string cannot match utf8 check string\n"));
 591   goto fail;
 592  }
 593  if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
 594   ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 595      || ( (prog->extflags & RXf_ANCH_BOL)
 596       && !multiline ) ); /* Check after \n? */
 597
 598   if (!ml_anch) {
 599   if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 600     && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 601    /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 602    && sv && !SvROK(sv)
 603    && (strpos != strbeg)) {
 604    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 605    goto fail;
 606   }
 607   if (prog->check_offset_min == prog->check_offset_max &&
 608    !(prog->extflags & RXf_CANY_SEEN)) {
 609    /* Substring at constant offset from beg-of-str... */
 610    I32 slen;
 611
 612    s = HOP3c(strpos, prog->check_offset_min, strend);
 613
 614    if (SvTAIL(check)) {
 615     slen = SvCUR(check); /* >= 1 */
 616
 617     if ( strend - s > slen || strend - s < slen - 1
 618      || (strend - s == slen && strend[-1] != '\n')) {
 619      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 620      goto fail_finish;
 621     }
 622     /* Now should match s[0..slen-2] */
 623     slen--;
 624     if (slen && (*SvPVX_const(check) != *s
 625        || (slen > 1
 626         && memNE(SvPVX_const(check), s, slen)))) {
 627     report_neq:
 628      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 629      goto fail_finish;
 630     }
 631    }
 632    else if (*SvPVX_const(check) != *s
 633      || ((slen = SvCUR(check)) > 1
 634       && memNE(SvPVX_const(check), s, slen)))
 635     goto report_neq;
 636    check_at = s;
 637    goto success_at_start;
 638   }
 639   }
 640   /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 641   s = strpos;
 642   start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 643   end_shift = prog->check_end_shift;
 644
 645   if (!ml_anch) {
 646    const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 647           - (SvTAIL(check) != 0);
 648    const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 649
 650    if (end_shift < eshift)
 651     end_shift = eshift;
 652   }
 653  }
 654  else {    /* Can match at random position */
 655   ml_anch = 0;
 656   s = strpos;
 657   start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 658   end_shift = prog->check_end_shift;
 659
 660   /* end shift should be non negative here */
 661  }
 662
 663 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
 664  if (end_shift < 0)
 665   Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 666     (IV)end_shift, RX_PRECOMP(prog));
 667 #endif
 668
 669   restart:
 670  /* Find a possible match in the region s..strend by looking for
 671  the "check" substring in the region corrected by start/end_shift. */
 672
 673  {
 674   I32 srch_start_shift = start_shift;
 675   I32 srch_end_shift = end_shift;
 676   if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 677    srch_end_shift -= ((strbeg - s) - srch_start_shift);
 678    srch_start_shift = strbeg - s;
 679   }
 680  DEBUG_OPTIMISE_MORE_r({
 681   PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 682    (IV)prog->check_offset_min,
 683    (IV)srch_start_shift,
 684    (IV)srch_end_shift,
 685    (IV)prog->check_end_shift);
 686  });
 687
 688  if (flags & REXEC_SCREAM) {
 689   I32 p = -1;   /* Internal iterator of scream. */
 690   I32 * const pp = data ? data->scream_pos : &p;
 691
 692   if (PL_screamfirst[BmRARE(check)] >= 0
 693    || ( BmRARE(check) == '\n'
 694     && (BmPREVIOUS(check) == SvCUR(check) - 1)
 695     && SvTAIL(check) ))
 696    s = screaminstr(sv, check,
 697        srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 698   else
 699    goto fail_finish;
 700   /* we may be pointing at the wrong string */
 701   if (s && RXp_MATCH_COPIED(prog))
 702    s = strbeg + (s - SvPVX_const(sv));
 703   if (data)
 704    *data->scream_olds = s;
 705  }
 706  else {
 707   U8* start_point;
 708   U8* end_point;
 709   if (prog->extflags & RXf_CANY_SEEN) {
 710    start_point= (U8*)(s + srch_start_shift);
 711    end_point= (U8*)(strend - srch_end_shift);
 712   } else {
 713    start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 714    end_point= HOP3(strend, -srch_end_shift, strbeg);
 715   }
 716   DEBUG_OPTIMISE_MORE_r({
 717    PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 718     (int)(end_point - start_point),
 719     (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 720     start_point);
 721   });
 722
 723   s = fbm_instr( start_point, end_point,
 724      check, multiline ? FBMrf_MULTILINE : 0);
 725  }
 726  }
 727  /* Update the count-of-usability, remove useless subpatterns,
 728   unshift s.  */
 729
 730  DEBUG_EXECUTE_r({
 731   RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 732    SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 733   PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 734       (s ? "Found" : "Did not find"),
 735    (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 736     ? "anchored" : "floating"),
 737    quoted,
 738    RE_SV_TAIL(check),
 739    (s ? " at offset " : "...\n") );
 740  });
 741
 742  if (!s)
 743   goto fail_finish;
 744  /* Finish the diagnostic message */
 745  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 746
 747  /* XXX dmq: first branch is for positive lookbehind...
 748  Our check string is offset from the beginning of the pattern.
 749  So we need to do any stclass tests offset forward from that
 750  point. I think. :-(
 751  */
 752
 753
 754
 755  check_at=s;
 756
 757
 758  /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 759  Start with the other substr.
 760  XXXX no SCREAM optimization yet - and a very coarse implementation
 761  XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 762     *always* match.  Probably should be marked during compile...
 763  Probably it is right to do no SCREAM here...
 764  */
 765
 766  if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 767     : (prog->float_substr && prog->anchored_substr))
 768  {
 769   /* Take into account the "other" substring. */
 770   /* XXXX May be hopelessly wrong for UTF... */
 771   if (!other_last)
 772    other_last = strpos;
 773   if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 774   do_other_anchored:
 775    {
 776     char * const last = HOP3c(s, -start_shift, strbeg);
 777     char *last1, *last2;
 778     char * const saved_s = s;
 779     SV* must;
 780
 781     t = s - prog->check_offset_max;
 782     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 783      && (!utf8_target
 784       || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 785        && t > strpos)))
 786      NOOP;
 787     else
 788      t = strpos;
 789     t = HOP3c(t, prog->anchored_offset, strend);
 790     if (t < other_last) /* These positions already checked */
 791      t = other_last;
 792     last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 793     if (last < last1)
 794      last1 = last;
 795     /* XXXX It is not documented what units *_offsets are in.
 796     We assume bytes, but this is clearly wrong.
 797     Meaning this code needs to be carefully reviewed for errors.
 798     dmq.
 799     */
 800
 801     /* On end-of-str: see comment below. */
 802     must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 803     if (must == &PL_sv_undef) {
 804      s = (char*)NULL;
 805      DEBUG_r(must = prog->anchored_utf8); /* for debug */
 806     }
 807     else
 808      s = fbm_instr(
 809       (unsigned char*)t,
 810       HOP3(HOP3(last1, prog->anchored_offset, strend)
 811         + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 812       must,
 813       multiline ? FBMrf_MULTILINE : 0
 814      );
 815     DEBUG_EXECUTE_r({
 816      RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 817       SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 818      PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 819       (s ? "Found" : "Contradicts"),
 820       quoted, RE_SV_TAIL(must));
 821     });
 822
 823
 824     if (!s) {
 825      if (last1 >= last2) {
 826       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 827             ", giving up...\n"));
 828       goto fail_finish;
 829      }
 830      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 831       ", trying floating at offset %ld...\n",
 832       (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 833      other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 834      s = HOP3c(last, 1, strend);
 835      goto restart;
 836     }
 837     else {
 838      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 839       (long)(s - i_strpos)));
 840      t = HOP3c(s, -prog->anchored_offset, strbeg);
 841      other_last = HOP3c(s, 1, strend);
 842      s = saved_s;
 843      if (t == strpos)
 844       goto try_at_start;
 845      goto try_at_offset;
 846     }
 847    }
 848   }
 849   else {  /* Take into account the floating substring. */
 850    char *last, *last1;
 851    char * const saved_s = s;
 852    SV* must;
 853
 854    t = HOP3c(s, -start_shift, strbeg);
 855    last1 = last =
 856     HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 857    if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 858     last = HOP3c(t, prog->float_max_offset, strend);
 859    s = HOP3c(t, prog->float_min_offset, strend);
 860    if (s < other_last)
 861     s = other_last;
 862  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 863    must = utf8_target ? prog->float_utf8 : prog->float_substr;
 864    /* fbm_instr() takes into account exact value of end-of-str
 865    if the check is SvTAIL(ed).  Since false positives are OK,
 866    and end-of-str is not later than strend we are OK. */
 867    if (must == &PL_sv_undef) {
 868     s = (char*)NULL;
 869     DEBUG_r(must = prog->float_utf8); /* for debug message */
 870    }
 871    else
 872     s = fbm_instr((unsigned char*)s,
 873        (unsigned char*)last + SvCUR(must)
 874         - (SvTAIL(must)!=0),
 875        must, multiline ? FBMrf_MULTILINE : 0);
 876    DEBUG_EXECUTE_r({
 877     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 878      SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 879     PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 880      (s ? "Found" : "Contradicts"),
 881      quoted, RE_SV_TAIL(must));
 882    });
 883    if (!s) {
 884     if (last1 == last) {
 885      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 886            ", giving up...\n"));
 887      goto fail_finish;
 888     }
 889     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 890      ", trying anchored starting at offset %ld...\n",
 891      (long)(saved_s + 1 - i_strpos)));
 892     other_last = last;
 893     s = HOP3c(t, 1, strend);
 894     goto restart;
 895    }
 896    else {
 897     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 898      (long)(s - i_strpos)));
 899     other_last = s; /* Fix this later. --Hugo */
 900     s = saved_s;
 901     if (t == strpos)
 902      goto try_at_start;
 903     goto try_at_offset;
 904    }
 905   }
 906  }
 907
 908
 909  t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 910
 911  DEBUG_OPTIMISE_MORE_r(
 912   PerlIO_printf(Perl_debug_log,
 913    "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 914    (IV)prog->check_offset_min,
 915    (IV)prog->check_offset_max,
 916    (IV)(s-strpos),
 917    (IV)(t-strpos),
 918    (IV)(t-s),
 919    (IV)(strend-strpos)
 920   )
 921  );
 922
 923  if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 924   && (!utf8_target
 925    || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 926     && t > strpos)))
 927  {
 928   /* Fixed substring is found far enough so that the match
 929   cannot start at strpos. */
 930  try_at_offset:
 931   if (ml_anch && t[-1] != '\n') {
 932    /* Eventually fbm_*() should handle this, but often
 933    anchored_offset is not 0, so this check will not be wasted. */
 934    /* XXXX In the code below we prefer to look for "^" even in
 935    presence of anchored substrings.  And we search even
 936    beyond the found float position.  These pessimizations
 937    are historical artefacts only.  */
 938   find_anchor:
 939    while (t < strend - prog->minlen) {
 940     if (*t == '\n') {
 941      if (t < check_at - prog->check_offset_min) {
 942       if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 943        /* Since we moved from the found position,
 944        we definitely contradict the found anchored
 945        substr.  Due to the above check we do not
 946        contradict "check" substr.
 947        Thus we can arrive here only if check substr
 948        is float.  Redo checking for "other"=="fixed".
 949        */
 950        strpos = t + 1;
 951        DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 952         PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 953        goto do_other_anchored;
 954       }
 955       /* We don't contradict the found floating substring. */
 956       /* XXXX Why not check for STCLASS? */
 957       s = t + 1;
 958       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 959        PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 960       goto set_useful;
 961      }
 962      /* Position contradicts check-string */
 963      /* XXXX probably better to look for check-string
 964      than for "\n", so one should lower the limit for t? */
 965      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 966       PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 967      other_last = strpos = s = t + 1;
 968      goto restart;
 969     }
 970     t++;
 971    }
 972    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 973       PL_colors[0], PL_colors[1]));
 974    goto fail_finish;
 975   }
 976   else {
 977    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 978       PL_colors[0], PL_colors[1]));
 979   }
 980   s = t;
 981  set_useful:
 982   ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
 983  }
 984  else {
 985   /* The found string does not prohibit matching at strpos,
 986   - no optimization of calling REx engine can be performed,
 987   unless it was an MBOL and we are not after MBOL,
 988   or a future STCLASS check will fail this. */
 989  try_at_start:
 990   /* Even in this situation we may use MBOL flag if strpos is offset
 991   wrt the start of the string. */
 992   if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 993    && (strpos != strbeg) && strpos[-1] != '\n'
 994    /* May be due to an implicit anchor of m{.*foo}  */
 995    && !(prog->intflags & PREGf_IMPLICIT))
 996   {
 997    t = strpos;
 998    goto find_anchor;
 999   }
1000   DEBUG_EXECUTE_r( if (ml_anch)
1001    PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1002       (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1003   );
1004  success_at_start:
1005   if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1006    && (utf8_target ? (
1007     prog->check_utf8  /* Could be deleted already */
1008     && --BmUSEFUL(prog->check_utf8) < 0
1009     && (prog->check_utf8 == prog->float_utf8)
1010    ) : (
1011     prog->check_substr  /* Could be deleted already */
1012     && --BmUSEFUL(prog->check_substr) < 0
1013     && (prog->check_substr == prog->float_substr)
1014    )))
1015   {
1016    /* If flags & SOMETHING - do not do it many times on the same match */
1017    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1018    /* XXX Does the destruction order has to change with utf8_target? */
1019    SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1020    SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1021    prog->check_substr = prog->check_utf8 = NULL; /* disable */
1022    prog->float_substr = prog->float_utf8 = NULL; /* clear */
1023    check = NULL;   /* abort */
1024    s = strpos;
1025    /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1026      see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1027    if (prog->intflags & PREGf_IMPLICIT)
1028     prog->extflags &= ~RXf_ANCH_MBOL;
1029    /* XXXX This is a remnant of the old implementation.  It
1030      looks wasteful, since now INTUIT can use many
1031      other heuristics. */
1032    prog->extflags &= ~RXf_USE_INTUIT;
1033    /* XXXX What other flags might need to be cleared in this branch? */
1034   }
1035   else
1036    s = strpos;
1037  }
1038
1039  /* Last resort... */
1040  /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1041  /* trie stclasses are too expensive to use here, we are better off to
1042  leave it to regmatch itself */
1043  if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1044   /* minlen == 0 is possible if regstclass is \b or \B,
1045   and the fixed substr is ''$.
1046   Since minlen is already taken into account, s+1 is before strend;
1047   accidentally, minlen >= 1 guaranties no false positives at s + 1
1048   even for \b or \B.  But (minlen? 1 : 0) below assumes that
1049   regstclass does not come from lookahead...  */
1050   /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1051   This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1052   const U8* const str = (U8*)STRING(progi->regstclass);
1053   const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1054      ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1055      : 1);
1056   char * endpos;
1057   if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1058    endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1059   else if (prog->float_substr || prog->float_utf8)
1060    endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1061   else
1062    endpos= strend;
1063
1064   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1065          (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1066
1067   t = s;
1068   s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1069   if (!s) {
1070 #ifdef DEBUGGING
1071    const char *what = NULL;
1072 #endif
1073    if (endpos == strend) {
1074     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1075         "Could not match STCLASS...\n") );
1076     goto fail;
1077    }
1078    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1079         "This position contradicts STCLASS...\n") );
1080    if ((prog->extflags & RXf_ANCH) && !ml_anch)
1081     goto fail;
1082    /* Contradict one of substrings */
1083    if (prog->anchored_substr || prog->anchored_utf8) {
1084     if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1085      DEBUG_EXECUTE_r( what = "anchored" );
1086     hop_and_restart:
1087      s = HOP3c(t, 1, strend);
1088      if (s + start_shift + end_shift > strend) {
1089       /* XXXX Should be taken into account earlier? */
1090       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1091            "Could not match STCLASS...\n") );
1092       goto fail;
1093      }
1094      if (!check)
1095       goto giveup;
1096      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1097         "Looking for %s substr starting at offset %ld...\n",
1098         what, (long)(s + start_shift - i_strpos)) );
1099      goto restart;
1100     }
1101     /* Have both, check_string is floating */
1102     if (t + start_shift >= check_at) /* Contradicts floating=check */
1103      goto retry_floating_check;
1104     /* Recheck anchored substring, but not floating... */
1105     s = check_at;
1106     if (!check)
1107      goto giveup;
1108     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1109       "Looking for anchored substr starting at offset %ld...\n",
1110       (long)(other_last - i_strpos)) );
1111     goto do_other_anchored;
1112    }
1113    /* Another way we could have checked stclass at the
1114    current position only: */
1115    if (ml_anch) {
1116     s = t = t + 1;
1117     if (!check)
1118      goto giveup;
1119     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1120       "Looking for /%s^%s/m starting at offset %ld...\n",
1121       PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1122     goto try_at_offset;
1123    }
1124    if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1125     goto fail;
1126    /* Check is floating substring. */
1127   retry_floating_check:
1128    t = check_at - start_shift;
1129    DEBUG_EXECUTE_r( what = "floating" );
1130    goto hop_and_restart;
1131   }
1132   if (t != s) {
1133    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1134       "By STCLASS: moving %ld --> %ld\n",
1135         (long)(t - i_strpos), (long)(s - i_strpos))
1136     );
1137   }
1138   else {
1139    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1140         "Does not contradict STCLASS...\n");
1141     );
1142   }
1143  }
1144   giveup:
1145  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1146       PL_colors[4], (check ? "Guessed" : "Giving up"),
1147       PL_colors[5], (long)(s - i_strpos)) );
1148  return s;
1149
1150   fail_finish:    /* Substring not found */
1151  if (prog->check_substr || prog->check_utf8)  /* could be removed already */
1152   BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1153   fail:
1154  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1155       PL_colors[4], PL_colors[5]));
1156  return NULL;
1157 }
1158
1159 #define DECL_TRIE_TYPE(scan) \
1160  const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1161      trie_type = (scan->flags != EXACT) \
1162        ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1163        : (utf8_target ? trie_utf8 : trie_plain)
1164
1165 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1166 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1167  switch (trie_type) {                                                    \
1168  case trie_utf8_fold:                                                    \
1169   if ( foldlen>0 ) {                                                  \
1170    uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1171    foldlen -= len;                                                 \
1172    uscan += len;                                                   \
1173    len=0;                                                          \
1174   } else {                                                            \
1175    uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1176    uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1177    foldlen -= UNISKIP( uvc );                                      \
1178    uscan = foldbuf + UNISKIP( uvc );                               \
1179   }                                                                   \
1180   break;                                                              \
1181  case trie_latin_utf8_fold:                                              \
1182   if ( foldlen>0 ) {                                                  \
1183    uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1184    foldlen -= len;                                                 \
1185    uscan += len;                                                   \
1186    len=0;                                                          \
1187   } else {                                                            \
1188    len = 1;                                                        \
1189    uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1190    foldlen -= UNISKIP( uvc );                                      \
1191    uscan = foldbuf + UNISKIP( uvc );                               \
1192   }                                                                   \
1193   break;                                                              \
1194  case trie_utf8:                                                         \
1195   uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1196   break;                                                              \
1197  case trie_plain:                                                        \
1198   uvc = (UV)*uc;                                                      \
1199   len = 1;                                                            \
1200  }                                                                       \
1201  if (uvc < 256) {                                                        \
1202   charid = trie->charmap[ uvc ];                                      \
1203  }                                                                       \
1204  else {                                                                  \
1205   charid = 0;                                                         \
1206   if (widecharmap) {                                                  \
1207    SV** const svpp = hv_fetch(widecharmap,                         \
1208       (char*)&uvc, sizeof(UV), 0);                        \
1209    if (svpp)                                                       \
1210     charid = (U16)SvIV(*svpp);                                  \
1211   }                                                                   \
1212  }                                                                       \
1213 } STMT_END
1214
1215 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1216 STMT_START {                                              \
1217  while (s <= e) {                                      \
1218   if ( (CoNd)                                       \
1219    && (ln == 1 || folder(s, pat_string, ln))    \
1220    && (!reginfo || regtry(reginfo, &s)) )       \
1221    goto got_it;                                  \
1222   s++;                                              \
1223  }                                                     \
1224 } STMT_END
1225
1226 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1227 STMT_START {                                          \
1228  while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1229   CoDe                                          \
1230   s += uskip;                                   \
1231  }                                                 \
1232 } STMT_END
1233
1234 #define REXEC_FBC_SCAN(CoDe)                          \
1235 STMT_START {                                          \
1236  while (s < strend) {                              \
1237   CoDe                                          \
1238   s++;                                          \
1239  }                                                 \
1240 } STMT_END
1241
1242 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1243 REXEC_FBC_UTF8_SCAN(                                  \
1244  if (CoNd) {                                       \
1245   if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1246    goto got_it;                              \
1247   else                                          \
1248    tmp = doevery;                            \
1249  }                                                 \
1250  else                                              \
1251   tmp = 1;                                      \
1252 )
1253
1254 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1255 REXEC_FBC_SCAN(                                       \
1256  if (CoNd) {                                       \
1257   if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1258    goto got_it;                              \
1259   else                                          \
1260    tmp = doevery;                            \
1261  }                                                 \
1262  else                                              \
1263   tmp = 1;                                      \
1264 )
1265
1266 #define REXEC_FBC_TRYIT               \
1267 if ((!reginfo || regtry(reginfo, &s))) \
1268  goto got_it
1269
1270 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1271  if (utf8_target) {                                             \
1272   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1273  }                                                          \
1274  else {                                                     \
1275   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1276  }
1277
1278 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1279  if (utf8_target) {                                             \
1280   UtFpReLoAd;                                            \
1281   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1282  }                                                          \
1283  else {                                                     \
1284   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1285  }
1286
1287 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1288  PL_reg_flags |= RF_tainted;                                \
1289  if (utf8_target) {                                             \
1290   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1291  }                                                          \
1292  else {                                                     \
1293   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1294  }
1295
1296 #define DUMP_EXEC_POS(li,s,doutf8) \
1297  dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1298
1299
1300 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1301   tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1302   tmp = TEST_NON_UTF8(tmp);                                              \
1303   REXEC_FBC_UTF8_SCAN(                                                   \
1304    if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1305     tmp = !tmp;                                                    \
1306     IF_SUCCESS;                                                    \
1307    }                                                                  \
1308    else {                                                             \
1309     IF_FAIL;                                                       \
1310    }                                                                  \
1311   );                                                                     \
1312
1313 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1314   if (s == PL_bostr) {                                                   \
1315    tmp = '\n';                                                        \
1316   }                                                                      \
1317   else {                                                                 \
1318    U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1319    tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1320   }                                                                      \
1321   tmp = TeSt1_UtF8;                                                      \
1322   LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1323   REXEC_FBC_UTF8_SCAN(                                                   \
1324    if (tmp == ! (TeSt2_UtF8)) { \
1325     tmp = !tmp;                                                    \
1326     IF_SUCCESS;                                                    \
1327    }                                                                  \
1328    else {                                                             \
1329     IF_FAIL;                                                       \
1330    }                                                                  \
1331   );                                                                     \
1332
1333 /* The only difference between the BOUND and NBOUND cases is that
1334  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1335  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1336  * with the other one being empty */
1337 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1338  FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1339
1340 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1341  FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1342
1343 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1344  FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1345
1346 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1347  FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1348
1349
1350 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1351  * be passed in completely with the variable name being tested, which isn't
1352  * such a clean interface, but this is easier to read than it was before.  We
1353  * are looking for the boundary (or non-boundary between a word and non-word
1354  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1355  * must be different.  Find the "wordness" of the character just prior to this
1356  * one, and compare it with the wordness of this one.  If they differ, we have
1357  * a boundary.  At the beginning of the string, pretend that the previous
1358  * character was a new-line */
1359 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1360  if (utf8_target) {                                                         \
1361     UTF8_CODE \
1362  }                                                                          \
1363  else {  /* Not utf8 */                                                     \
1364   tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1365   tmp = TEST_NON_UTF8(tmp);                                              \
1366   REXEC_FBC_SCAN(                                                        \
1367    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1368     tmp = !tmp;                                                    \
1369     IF_SUCCESS;                                                    \
1370    }                                                                  \
1371    else {                                                             \
1372     IF_FAIL;                                                       \
1373    }                                                                  \
1374   );                                                                     \
1375  }                                                                          \
1376  if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1377   goto got_it;
1378
1379 /* We know what class REx starts with.  Try to find this position... */
1380 /* if reginfo is NULL, its a dryrun */
1381 /* annoyingly all the vars in this routine have different names from their counterparts
1382    in regmatch. /grrr */
1383
1384 STATIC char *
1385 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1386  const char *strend, regmatch_info *reginfo)
1387 {
1388   dVAR;
1389   const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1390   char *pat_string;   /* The pattern's exactish string */
1391   char *pat_end;     /* ptr to end char of pat_string */
1392   re_fold_t folder; /* Function for computing non-utf8 folds */
1393   const U8 *fold_array;   /* array for folding ords < 256 */
1394   STRLEN ln;
1395   STRLEN lnc;
1396   register STRLEN uskip;
1397   U8 c1;
1398   U8 c2;
1399   char *e;
1400   register I32 tmp = 1; /* Scratch variable? */
1401   register const bool utf8_target = PL_reg_match_utf8;
1402   UV utf8_fold_flags = 0;
1403   RXi_GET_DECL(prog,progi);
1404
1405   PERL_ARGS_ASSERT_FIND_BYCLASS;
1406
1407   /* We know what class it must start with. */
1408   switch (OP(c)) {
1409   case ANYOFV:
1410   case ANYOF:
1411    if (utf8_target || OP(c) == ANYOFV) {
1412     STRLEN inclasslen = strend - s;
1413     REXEC_FBC_UTF8_CLASS_SCAN(
1414       reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1415    }
1416    else {
1417     REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1418    }
1419    break;
1420   case CANY:
1421    REXEC_FBC_SCAN(
1422     if (tmp && (!reginfo || regtry(reginfo, &s)))
1423      goto got_it;
1424     else
1425      tmp = doevery;
1426    );
1427    break;
1428
1429   case EXACTFA:
1430    if (UTF_PATTERN || utf8_target) {
1431     utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1432     goto do_exactf_utf8;
1433    }
1434    fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1435    folder = foldEQ_latin1;     /* /a, except the sharp s one which */
1436    goto do_exactf_non_utf8;     /* isn't dealt with by these */
1437
1438   case EXACTFU:
1439    if (UTF_PATTERN || utf8_target) {
1440     utf8_fold_flags = 0;
1441     goto do_exactf_utf8;
1442    }
1443    fold_array = PL_fold_latin1;
1444    folder = foldEQ_latin1;
1445    /* XXX This uses the full utf8 fold because if the pattern contains
1446    * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
1447    * There could be a new node type, say EXACTFU_SS, which is
1448    * generated by regcomp only if there is an 'ss', and then every
1449    * other case could goto do_exactf_non_utf8;*/
1450    goto do_exactf_utf8;
1451
1452   case EXACTF:
1453    if (UTF_PATTERN || utf8_target) {
1454     utf8_fold_flags = 0;
1455     goto do_exactf_utf8;
1456    }
1457    fold_array = PL_fold;
1458    folder = foldEQ;
1459    goto do_exactf_non_utf8;
1460
1461   case EXACTFL:
1462    if (UTF_PATTERN || utf8_target) {
1463     utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1464     goto do_exactf_utf8;
1465    }
1466    fold_array = PL_fold_locale;
1467    folder = foldEQ_locale;
1468
1469    /* FALL THROUGH */
1470
1471   do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1472
1473    /* The idea in the non-utf8 EXACTF* cases is to first find the
1474    * first character of the EXACTF* node and then, if necessary,
1475    * case-insensitively compare the full text of the node.  c1 is the
1476    * first character.  c2 is its fold.  This logic will not work for
1477    * Unicode semantics and the german sharp ss, which hence should
1478    * not be compiled into a node that gets here. */
1479    pat_string = STRING(c);
1480    ln  = STR_LEN(c); /* length to match in octets/bytes */
1481
1482    e = HOP3c(strend, -((I32)ln), s);
1483
1484    if (!reginfo && e < s) {
1485     e = s;   /* Due to minlen logic of intuit() */
1486    }
1487
1488    c1 = *pat_string;
1489    c2 = fold_array[c1];
1490    if (c1 == c2) { /* If char and fold are the same */
1491     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1492    }
1493    else {
1494     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1495    }
1496    break;
1497
1498   do_exactf_utf8:
1499
1500    /* If one of the operands is in utf8, we can't use the simpler
1501    * folding above, due to the fact that many different characters
1502    * can have the same fold, or portion of a fold, or different-
1503    * length fold */
1504    pat_string = STRING(c);
1505    ln  = STR_LEN(c); /* length to match in octets/bytes */
1506    pat_end = pat_string + ln;
1507    lnc = (UTF_PATTERN) /* length to match in characters */
1508      ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1509      : ln;
1510
1511    /* Set the end position to the final character available */
1512    e = HOP3c(strend, -1, s);
1513
1514    if (!reginfo && e < s) {
1515     e = s;   /* Due to minlen logic of intuit() */
1516    }
1517
1518    while (s <= e) {
1519     char *my_strend= (char *)strend;
1520     if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1521      pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1522      && (!reginfo || regtry(reginfo, &s)) )
1523     {
1524      goto got_it;
1525     }
1526     s += (utf8_target) ? UTF8SKIP(s) : 1;
1527    }
1528    break;
1529   case BOUNDL:
1530    PL_reg_flags |= RF_tainted;
1531    FBC_BOUND(isALNUM_LC,
1532      isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1533      isALNUM_LC_utf8((U8*)s));
1534    break;
1535   case NBOUNDL:
1536    PL_reg_flags |= RF_tainted;
1537    FBC_NBOUND(isALNUM_LC,
1538      isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1539      isALNUM_LC_utf8((U8*)s));
1540    break;
1541   case BOUND:
1542    FBC_BOUND(isWORDCHAR,
1543      isALNUM_uni(tmp),
1544      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1545    break;
1546   case BOUNDA:
1547    FBC_BOUND_NOLOAD(isWORDCHAR_A,
1548        isWORDCHAR_A(tmp),
1549        isWORDCHAR_A((U8*)s));
1550    break;
1551   case NBOUND:
1552    FBC_NBOUND(isWORDCHAR,
1553      isALNUM_uni(tmp),
1554      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1555    break;
1556   case NBOUNDA:
1557    FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1558        isWORDCHAR_A(tmp),
1559        isWORDCHAR_A((U8*)s));
1560    break;
1561   case BOUNDU:
1562    FBC_BOUND(isWORDCHAR_L1,
1563      isALNUM_uni(tmp),
1564      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1565    break;
1566   case NBOUNDU:
1567    FBC_NBOUND(isWORDCHAR_L1,
1568      isALNUM_uni(tmp),
1569      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1570    break;
1571   case ALNUML:
1572    REXEC_FBC_CSCAN_TAINT(
1573     isALNUM_LC_utf8((U8*)s),
1574     isALNUM_LC(*s)
1575    );
1576    break;
1577   case ALNUMU:
1578    REXEC_FBC_CSCAN_PRELOAD(
1579     LOAD_UTF8_CHARCLASS_ALNUM(),
1580     swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1581     isWORDCHAR_L1((U8) *s)
1582    );
1583    break;
1584   case ALNUM:
1585    REXEC_FBC_CSCAN_PRELOAD(
1586     LOAD_UTF8_CHARCLASS_ALNUM(),
1587     swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1588     isWORDCHAR((U8) *s)
1589    );
1590    break;
1591   case ALNUMA:
1592    /* Don't need to worry about utf8, as it can match only a single
1593    * byte invariant character */
1594    REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1595    break;
1596   case NALNUMU:
1597    REXEC_FBC_CSCAN_PRELOAD(
1598     LOAD_UTF8_CHARCLASS_ALNUM(),
1599     !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1600     ! isWORDCHAR_L1((U8) *s)
1601    );
1602    break;
1603   case NALNUM:
1604    REXEC_FBC_CSCAN_PRELOAD(
1605     LOAD_UTF8_CHARCLASS_ALNUM(),
1606     !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1607     ! isALNUM(*s)
1608    );
1609    break;
1610   case NALNUMA:
1611    REXEC_FBC_CSCAN(
1612     !isWORDCHAR_A(*s),
1613     !isWORDCHAR_A(*s)
1614    );
1615    break;
1616   case NALNUML:
1617    REXEC_FBC_CSCAN_TAINT(
1618     !isALNUM_LC_utf8((U8*)s),
1619     !isALNUM_LC(*s)
1620    );
1621    break;
1622   case SPACEU:
1623    REXEC_FBC_CSCAN_PRELOAD(
1624     LOAD_UTF8_CHARCLASS_SPACE(),
1625     *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1626     isSPACE_L1((U8) *s)
1627    );
1628    break;
1629   case SPACE:
1630    REXEC_FBC_CSCAN_PRELOAD(
1631     LOAD_UTF8_CHARCLASS_SPACE(),
1632     *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1633     isSPACE((U8) *s)
1634    );
1635    break;
1636   case SPACEA:
1637    /* Don't need to worry about utf8, as it can match only a single
1638    * byte invariant character */
1639    REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1640    break;
1641   case SPACEL:
1642    REXEC_FBC_CSCAN_TAINT(
1643     isSPACE_LC_utf8((U8*)s),
1644     isSPACE_LC(*s)
1645    );
1646    break;
1647   case NSPACEU:
1648    REXEC_FBC_CSCAN_PRELOAD(
1649     LOAD_UTF8_CHARCLASS_SPACE(),
1650     !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1651     ! isSPACE_L1((U8) *s)
1652    );
1653    break;
1654   case NSPACE:
1655    REXEC_FBC_CSCAN_PRELOAD(
1656     LOAD_UTF8_CHARCLASS_SPACE(),
1657     !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1658     ! isSPACE((U8) *s)
1659    );
1660    break;
1661   case NSPACEA:
1662    REXEC_FBC_CSCAN(
1663     !isSPACE_A(*s),
1664     !isSPACE_A(*s)
1665    );
1666    break;
1667   case NSPACEL:
1668    REXEC_FBC_CSCAN_TAINT(
1669     !isSPACE_LC_utf8((U8*)s),
1670     !isSPACE_LC(*s)
1671    );
1672    break;
1673   case DIGIT:
1674    REXEC_FBC_CSCAN_PRELOAD(
1675     LOAD_UTF8_CHARCLASS_DIGIT(),
1676     swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1677     isDIGIT(*s)
1678    );
1679    break;
1680   case DIGITA:
1681    /* Don't need to worry about utf8, as it can match only a single
1682    * byte invariant character */
1683    REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1684    break;
1685   case DIGITL:
1686    REXEC_FBC_CSCAN_TAINT(
1687     isDIGIT_LC_utf8((U8*)s),
1688     isDIGIT_LC(*s)
1689    );
1690    break;
1691   case NDIGIT:
1692    REXEC_FBC_CSCAN_PRELOAD(
1693     LOAD_UTF8_CHARCLASS_DIGIT(),
1694     !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1695     !isDIGIT(*s)
1696    );
1697    break;
1698   case NDIGITA:
1699    REXEC_FBC_CSCAN(
1700     !isDIGIT_A(*s),
1701     !isDIGIT_A(*s)
1702    );
1703    break;
1704   case NDIGITL:
1705    REXEC_FBC_CSCAN_TAINT(
1706     !isDIGIT_LC_utf8((U8*)s),
1707     !isDIGIT_LC(*s)
1708    );
1709    break;
1710   case LNBREAK:
1711    REXEC_FBC_CSCAN(
1712     is_LNBREAK_utf8(s),
1713     is_LNBREAK_latin1(s)
1714    );
1715    break;
1716   case VERTWS:
1717    REXEC_FBC_CSCAN(
1718     is_VERTWS_utf8(s),
1719     is_VERTWS_latin1(s)
1720    );
1721    break;
1722   case NVERTWS:
1723    REXEC_FBC_CSCAN(
1724     !is_VERTWS_utf8(s),
1725     !is_VERTWS_latin1(s)
1726    );
1727    break;
1728   case HORIZWS:
1729    REXEC_FBC_CSCAN(
1730     is_HORIZWS_utf8(s),
1731     is_HORIZWS_latin1(s)
1732    );
1733    break;
1734   case NHORIZWS:
1735    REXEC_FBC_CSCAN(
1736     !is_HORIZWS_utf8(s),
1737     !is_HORIZWS_latin1(s)
1738    );
1739    break;
1740   case AHOCORASICKC:
1741   case AHOCORASICK:
1742    {
1743     DECL_TRIE_TYPE(c);
1744     /* what trie are we using right now */
1745     reg_ac_data *aho
1746      = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1747     reg_trie_data *trie
1748      = (reg_trie_data*)progi->data->data[ aho->trie ];
1749     HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1750
1751     const char *last_start = strend - trie->minlen;
1752 #ifdef DEBUGGING
1753     const char *real_start = s;
1754 #endif
1755     STRLEN maxlen = trie->maxlen;
1756     SV *sv_points;
1757     U8 **points; /* map of where we were in the input string
1758         when reading a given char. For ASCII this
1759         is unnecessary overhead as the relationship
1760         is always 1:1, but for Unicode, especially
1761         case folded Unicode this is not true. */
1762     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1763     U8 *bitmap=NULL;
1764
1765
1766     GET_RE_DEBUG_FLAGS_DECL;
1767
1768     /* We can't just allocate points here. We need to wrap it in
1769     * an SV so it gets freed properly if there is a croak while
1770     * running the match */
1771     ENTER;
1772     SAVETMPS;
1773     sv_points=newSV(maxlen * sizeof(U8 *));
1774     SvCUR_set(sv_points,
1775      maxlen * sizeof(U8 *));
1776     SvPOK_on(sv_points);
1777     sv_2mortal(sv_points);
1778     points=(U8**)SvPV_nolen(sv_points );
1779     if ( trie_type != trie_utf8_fold
1780      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1781     {
1782      if (trie->bitmap)
1783       bitmap=(U8*)trie->bitmap;
1784      else
1785       bitmap=(U8*)ANYOF_BITMAP(c);
1786     }
1787     /* this is the Aho-Corasick algorithm modified a touch
1788     to include special handling for long "unknown char"
1789     sequences. The basic idea being that we use AC as long
1790     as we are dealing with a possible matching char, when
1791     we encounter an unknown char (and we have not encountered
1792     an accepting state) we scan forward until we find a legal
1793     starting char.
1794     AC matching is basically that of trie matching, except
1795     that when we encounter a failing transition, we fall back
1796     to the current states "fail state", and try the current char
1797     again, a process we repeat until we reach the root state,
1798     state 1, or a legal transition. If we fail on the root state
1799     then we can either terminate if we have reached an accepting
1800     state previously, or restart the entire process from the beginning
1801     if we have not.
1802
1803     */
1804     while (s <= last_start) {
1805      const U32 uniflags = UTF8_ALLOW_DEFAULT;
1806      U8 *uc = (U8*)s;
1807      U16 charid = 0;
1808      U32 base = 1;
1809      U32 state = 1;
1810      UV uvc = 0;
1811      STRLEN len = 0;
1812      STRLEN foldlen = 0;
1813      U8 *uscan = (U8*)NULL;
1814      U8 *leftmost = NULL;
1815 #ifdef DEBUGGING
1816      U32 accepted_word= 0;
1817 #endif
1818      U32 pointpos = 0;
1819
1820      while ( state && uc <= (U8*)strend ) {
1821       int failed=0;
1822       U32 word = aho->states[ state ].wordnum;
1823
1824       if( state==1 ) {
1825        if ( bitmap ) {
1826         DEBUG_TRIE_EXECUTE_r(
1827          if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1828           dump_exec_pos( (char *)uc, c, strend, real_start,
1829            (char *)uc, utf8_target );
1830           PerlIO_printf( Perl_debug_log,
1831            " Scanning for legal start char...\n");
1832          }
1833         );
1834         if (utf8_target) {
1835          while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1836           uc += UTF8SKIP(uc);
1837          }
1838         } else {
1839          while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1840           uc++;
1841          }
1842         }
1843         s= (char *)uc;
1844        }
1845        if (uc >(U8*)last_start) break;
1846       }
1847
1848       if ( word ) {
1849        U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1850        if (!leftmost || lpos < leftmost) {
1851         DEBUG_r(accepted_word=word);
1852         leftmost= lpos;
1853        }
1854        if (base==0) break;
1855
1856       }
1857       points[pointpos++ % maxlen]= uc;
1858       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1859            uscan, len, uvc, charid, foldlen,
1860            foldbuf, uniflags);
1861       DEBUG_TRIE_EXECUTE_r({
1862        dump_exec_pos( (char *)uc, c, strend, real_start,
1863         s,   utf8_target );
1864        PerlIO_printf(Perl_debug_log,
1865         " Charid:%3u CP:%4"UVxf" ",
1866         charid, uvc);
1867       });
1868
1869       do {
1870 #ifdef DEBUGGING
1871        word = aho->states[ state ].wordnum;
1872 #endif
1873        base = aho->states[ state ].trans.base;
1874
1875        DEBUG_TRIE_EXECUTE_r({
1876         if (failed)
1877          dump_exec_pos( (char *)uc, c, strend, real_start,
1878           s,   utf8_target );
1879         PerlIO_printf( Perl_debug_log,
1880          "%sState: %4"UVxf", word=%"UVxf,
1881          failed ? " Fail transition to " : "",
1882          (UV)state, (UV)word);
1883        });
1884        if ( base ) {
1885         U32 tmp;
1886         I32 offset;
1887         if (charid &&
1888          ( ((offset = base + charid
1889           - 1 - trie->uniquecharcount)) >= 0)
1890          && ((U32)offset < trie->lasttrans)
1891          && trie->trans[offset].check == state
1892          && (tmp=trie->trans[offset].next))
1893         {
1894          DEBUG_TRIE_EXECUTE_r(
1895           PerlIO_printf( Perl_debug_log," - legal\n"));
1896          state = tmp;
1897          break;
1898         }
1899         else {
1900          DEBUG_TRIE_EXECUTE_r(
1901           PerlIO_printf( Perl_debug_log," - fail\n"));
1902          failed = 1;
1903          state = aho->fail[state];
1904         }
1905        }
1906        else {
1907         /* we must be accepting here */
1908         DEBUG_TRIE_EXECUTE_r(
1909           PerlIO_printf( Perl_debug_log," - accepting\n"));
1910         failed = 1;
1911         break;
1912        }
1913       } while(state);
1914       uc += len;
1915       if (failed) {
1916        if (leftmost)
1917         break;
1918        if (!state) state = 1;
1919       }
1920      }
1921      if ( aho->states[ state ].wordnum ) {
1922       U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1923       if (!leftmost || lpos < leftmost) {
1924        DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1925        leftmost = lpos;
1926       }
1927      }
1928      if (leftmost) {
1929       s = (char*)leftmost;
1930       DEBUG_TRIE_EXECUTE_r({
1931        PerlIO_printf(
1932         Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1933         (UV)accepted_word, (IV)(s - real_start)
1934        );
1935       });
1936       if (!reginfo || regtry(reginfo, &s)) {
1937        FREETMPS;
1938        LEAVE;
1939        goto got_it;
1940       }
1941       s = HOPc(s,1);
1942       DEBUG_TRIE_EXECUTE_r({
1943        PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1944       });
1945      } else {
1946       DEBUG_TRIE_EXECUTE_r(
1947        PerlIO_printf( Perl_debug_log,"No match.\n"));
1948       break;
1949      }
1950     }
1951     FREETMPS;
1952     LEAVE;
1953    }
1954    break;
1955   default:
1956    Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1957    break;
1958   }
1959   return 0;
1960  got_it:
1961   return s;
1962 }
1963
1964
1965 /*
1966  - regexec_flags - match a regexp against a string
1967  */
1968 I32
1969 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1970    char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1971 /* strend: pointer to null at end of string */
1972 /* strbeg: real beginning of string */
1973 /* minend: end of match must be >=minend after stringarg. */
1974 /* data: May be used for some additional optimizations.
1975   Currently its only used, with a U32 cast, for transmitting
1976   the ganch offset when doing a /g match. This will change */
1977 /* nosave: For optimizations. */
1978 {
1979  dVAR;
1980  struct regexp *const prog = (struct regexp *)SvANY(rx);
1981  /*register*/ char *s;
1982  register regnode *c;
1983  /*register*/ char *startpos = stringarg;
1984  I32 minlen;  /* must match at least this many chars */
1985  I32 dontbother = 0; /* how many characters not to try at end */
1986  I32 end_shift = 0;   /* Same for the end. */  /* CC */
1987  I32 scream_pos = -1;  /* Internal iterator of scream. */
1988  char *scream_olds = NULL;
1989  const bool utf8_target = cBOOL(DO_UTF8(sv));
1990  I32 multiline;
1991  RXi_GET_DECL(prog,progi);
1992  regmatch_info reginfo;  /* create some info to pass to regtry etc */
1993  regexp_paren_pair *swap = NULL;
1994  GET_RE_DEBUG_FLAGS_DECL;
1995
1996  PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1997  PERL_UNUSED_ARG(data);
1998
1999  /* Be paranoid... */
2000  if (prog == NULL || startpos == NULL) {
2001   Perl_croak(aTHX_ "NULL regexp parameter");
2002   return 0;
2003  }
2004
2005  multiline = prog->extflags & RXf_PMf_MULTILINE;
2006  reginfo.prog = rx;  /* Yes, sorry that this is confusing.  */
2007
2008  RX_MATCH_UTF8_set(rx, utf8_target);
2009  DEBUG_EXECUTE_r(
2010   debug_start_match(rx, utf8_target, startpos, strend,
2011   "Matching");
2012  );
2013
2014  minlen = prog->minlen;
2015
2016  if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2017   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2018        "String too short [regexec_flags]...\n"));
2019   goto phooey;
2020  }
2021
2022
2023  /* Check validity of program. */
2024  if (UCHARAT(progi->program) != REG_MAGIC) {
2025   Perl_croak(aTHX_ "corrupted regexp program");
2026  }
2027
2028  PL_reg_flags = 0;
2029  PL_reg_eval_set = 0;
2030  PL_reg_maxiter = 0;
2031
2032  if (RX_UTF8(rx))
2033   PL_reg_flags |= RF_utf8;
2034
2035  /* Mark beginning of line for ^ and lookbehind. */
2036  reginfo.bol = startpos; /* XXX not used ??? */
2037  PL_bostr  = strbeg;
2038  reginfo.sv = sv;
2039
2040  /* Mark end of line for $ (and such) */
2041  PL_regeol = strend;
2042
2043  /* see how far we have to get to not match where we matched before */
2044  reginfo.till = startpos+minend;
2045
2046  /* If there is a "must appear" string, look for it. */
2047  s = startpos;
2048
2049  if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2050   MAGIC *mg;
2051   if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2052    reginfo.ganch = startpos + prog->gofs;
2053    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2054    "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2055   } else if (sv && SvTYPE(sv) >= SVt_PVMG
2056     && SvMAGIC(sv)
2057     && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2058     && mg->mg_len >= 0) {
2059    reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2060    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2061     "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2062
2063    if (prog->extflags & RXf_ANCH_GPOS) {
2064     if (s > reginfo.ganch)
2065      goto phooey;
2066     s = reginfo.ganch - prog->gofs;
2067     DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2068      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2069     if (s < strbeg)
2070      goto phooey;
2071    }
2072   }
2073   else if (data) {
2074    reginfo.ganch = strbeg + PTR2UV(data);
2075    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2076     "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2077
2078   } else {    /* pos() not defined */
2079    reginfo.ganch = strbeg;
2080    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2081     "GPOS: reginfo.ganch = strbeg\n"));
2082   }
2083  }
2084  if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2085   /* We have to be careful. If the previous successful match
2086   was from this regex we don't want a subsequent partially
2087   successful match to clobber the old results.
2088   So when we detect this possibility we add a swap buffer
2089   to the re, and switch the buffer each match. If we fail
2090   we switch it back, otherwise we leave it swapped.
2091   */
2092   swap = prog->offs;
2093   /* do we need a save destructor here for eval dies? */
2094   Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2095  }
2096  if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2097   re_scream_pos_data d;
2098
2099   d.scream_olds = &scream_olds;
2100   d.scream_pos = &scream_pos;
2101   s = re_intuit_start(rx, sv, s, strend, flags, &d);
2102   if (!s) {
2103    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2104    goto phooey; /* not present */
2105   }
2106  }
2107
2108
2109
2110  /* Simplest case:  anchored match need be tried only once. */
2111  /*  [unless only anchor is BOL and multiline is set] */
2112  if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2113   if (s == startpos && regtry(&reginfo, &startpos))
2114    goto got_it;
2115   else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2116     || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2117   {
2118    char *end;
2119
2120    if (minlen)
2121     dontbother = minlen - 1;
2122    end = HOP3c(strend, -dontbother, strbeg) - 1;
2123    /* for multiline we only have to try after newlines */
2124    if (prog->check_substr || prog->check_utf8) {
2125     /* because of the goto we can not easily reuse the macros for bifurcating the
2126     unicode/non-unicode match modes here like we do elsewhere - demerphq */
2127     if (utf8_target) {
2128      if (s == startpos)
2129       goto after_try_utf8;
2130      while (1) {
2131       if (regtry(&reginfo, &s)) {
2132        goto got_it;
2133       }
2134      after_try_utf8:
2135       if (s > end) {
2136        goto phooey;
2137       }
2138       if (prog->extflags & RXf_USE_INTUIT) {
2139        s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2140        if (!s) {
2141         goto phooey;
2142        }
2143       }
2144       else {
2145        s += UTF8SKIP(s);
2146       }
2147      }
2148     } /* end search for check string in unicode */
2149     else {
2150      if (s == startpos) {
2151       goto after_try_latin;
2152      }
2153      while (1) {
2154       if (regtry(&reginfo, &s)) {
2155        goto got_it;
2156       }
2157      after_try_latin:
2158       if (s > end) {
2159        goto phooey;
2160       }
2161       if (prog->extflags & RXf_USE_INTUIT) {
2162        s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2163        if (!s) {
2164         goto phooey;
2165        }
2166       }
2167       else {
2168        s++;
2169       }
2170      }
2171     } /* end search for check string in latin*/
2172    } /* end search for check string */
2173    else { /* search for newline */
2174     if (s > startpos) {
2175      /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2176      s--;
2177     }
2178     /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2179     while (s < end) {
2180      if (*s++ == '\n') { /* don't need PL_utf8skip here */
2181       if (regtry(&reginfo, &s))
2182        goto got_it;
2183      }
2184     }
2185    } /* end search for newline */
2186   } /* end anchored/multiline check string search */
2187   goto phooey;
2188  } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2189  {
2190   /* the warning about reginfo.ganch being used without initialization
2191   is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2192   and we only enter this block when the same bit is set. */
2193   char *tmp_s = reginfo.ganch - prog->gofs;
2194
2195   if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2196    goto got_it;
2197   goto phooey;
2198  }
2199
2200  /* Messy cases:  unanchored match. */
2201  if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2202   /* we have /x+whatever/ */
2203   /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2204   char ch;
2205 #ifdef DEBUGGING
2206   int did_match = 0;
2207 #endif
2208   if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2209    utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2210   ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2211
2212   if (utf8_target) {
2213    REXEC_FBC_SCAN(
2214     if (*s == ch) {
2215      DEBUG_EXECUTE_r( did_match = 1 );
2216      if (regtry(&reginfo, &s)) goto got_it;
2217      s += UTF8SKIP(s);
2218      while (s < strend && *s == ch)
2219       s += UTF8SKIP(s);
2220     }
2221    );
2222   }
2223   else {
2224    REXEC_FBC_SCAN(
2225     if (*s == ch) {
2226      DEBUG_EXECUTE_r( did_match = 1 );
2227      if (regtry(&reginfo, &s)) goto got_it;
2228      s++;
2229      while (s < strend && *s == ch)
2230       s++;
2231     }
2232    );
2233   }
2234   DEBUG_EXECUTE_r(if (!did_match)
2235     PerlIO_printf(Perl_debug_log,
2236         "Did not find anchored character...\n")
2237    );
2238  }
2239  else if (prog->anchored_substr != NULL
2240    || prog->anchored_utf8 != NULL
2241    || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2242     && prog->float_max_offset < strend - s)) {
2243   SV *must;
2244   I32 back_max;
2245   I32 back_min;
2246   char *last;
2247   char *last1;  /* Last position checked before */
2248 #ifdef DEBUGGING
2249   int did_match = 0;
2250 #endif
2251   if (prog->anchored_substr || prog->anchored_utf8) {
2252    if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2253     utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2254    must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2255    back_max = back_min = prog->anchored_offset;
2256   } else {
2257    if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2258     utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2259    must = utf8_target ? prog->float_utf8 : prog->float_substr;
2260    back_max = prog->float_max_offset;
2261    back_min = prog->float_min_offset;
2262   }
2263
2264
2265   if (must == &PL_sv_undef)
2266    /* could not downgrade utf8 check substring, so must fail */
2267    goto phooey;
2268
2269   if (back_min<0) {
2270    last = strend;
2271   } else {
2272    last = HOP3c(strend, /* Cannot start after this */
2273     -(I32)(CHR_SVLEN(must)
2274       - (SvTAIL(must) != 0) + back_min), strbeg);
2275   }
2276   if (s > PL_bostr)
2277    last1 = HOPc(s, -1);
2278   else
2279    last1 = s - 1; /* bogus */
2280
2281   /* XXXX check_substr already used to find "s", can optimize if
2282   check_substr==must. */
2283   scream_pos = -1;
2284   dontbother = end_shift;
2285   strend = HOPc(strend, -dontbother);
2286   while ( (s <= last) &&
2287     ((flags & REXEC_SCREAM)
2288     ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2289          end_shift, &scream_pos, 0))
2290     : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2291         (unsigned char*)strend, must,
2292         multiline ? FBMrf_MULTILINE : 0))) ) {
2293    /* we may be pointing at the wrong string */
2294    if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2295     s = strbeg + (s - SvPVX_const(sv));
2296    DEBUG_EXECUTE_r( did_match = 1 );
2297    if (HOPc(s, -back_max) > last1) {
2298     last1 = HOPc(s, -back_min);
2299     s = HOPc(s, -back_max);
2300    }
2301    else {
2302     char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2303
2304     last1 = HOPc(s, -back_min);
2305     s = t;
2306    }
2307    if (utf8_target) {
2308     while (s <= last1) {
2309      if (regtry(&reginfo, &s))
2310       goto got_it;
2311      s += UTF8SKIP(s);
2312     }
2313    }
2314    else {
2315     while (s <= last1) {
2316      if (regtry(&reginfo, &s))
2317       goto got_it;
2318      s++;
2319     }
2320    }
2321   }
2322   DEBUG_EXECUTE_r(if (!did_match) {
2323    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2324     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2325    PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2326        ((must == prog->anchored_substr || must == prog->anchored_utf8)
2327        ? "anchored" : "floating"),
2328     quoted, RE_SV_TAIL(must));
2329   });
2330   goto phooey;
2331  }
2332  else if ( (c = progi->regstclass) ) {
2333   if (minlen) {
2334    const OPCODE op = OP(progi->regstclass);
2335    /* don't bother with what can't match */
2336    if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2337     strend = HOPc(strend, -(minlen - 1));
2338   }
2339   DEBUG_EXECUTE_r({
2340    SV * const prop = sv_newmortal();
2341    regprop(prog, prop, c);
2342    {
2343     RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2344      s,strend-s,60);
2345     PerlIO_printf(Perl_debug_log,
2346      "Matching stclass %.*s against %s (%d bytes)\n",
2347      (int)SvCUR(prop), SvPVX_const(prop),
2348      quoted, (int)(strend - s));
2349    }
2350   });
2351   if (find_byclass(prog, c, s, strend, &reginfo))
2352    goto got_it;
2353   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2354  }
2355  else {
2356   dontbother = 0;
2357   if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2358    /* Trim the end. */
2359    char *last;
2360    SV* float_real;
2361
2362    if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2363     utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2364    float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2365
2366    if (flags & REXEC_SCREAM) {
2367     last = screaminstr(sv, float_real, s - strbeg,
2368         end_shift, &scream_pos, 1); /* last one */
2369     if (!last)
2370      last = scream_olds; /* Only one occurrence. */
2371     /* we may be pointing at the wrong string */
2372     else if (RXp_MATCH_COPIED(prog))
2373      s = strbeg + (s - SvPVX_const(sv));
2374    }
2375    else {
2376     STRLEN len;
2377     const char * const little = SvPV_const(float_real, len);
2378
2379     if (SvTAIL(float_real)) {
2380      if (memEQ(strend - len + 1, little, len - 1))
2381       last = strend - len + 1;
2382      else if (!multiline)
2383       last = memEQ(strend - len, little, len)
2384        ? strend - len : NULL;
2385      else
2386       goto find_last;
2387     } else {
2388     find_last:
2389      if (len)
2390       last = rninstr(s, strend, little, little + len);
2391      else
2392       last = strend; /* matching "$" */
2393     }
2394    }
2395    if (last == NULL) {
2396     DEBUG_EXECUTE_r(
2397      PerlIO_printf(Perl_debug_log,
2398       "%sCan't trim the tail, match fails (should not happen)%s\n",
2399       PL_colors[4], PL_colors[5]));
2400     goto phooey; /* Should not happen! */
2401    }
2402    dontbother = strend - last + prog->float_min_offset;
2403   }
2404   if (minlen && (dontbother < minlen))
2405    dontbother = minlen - 1;
2406   strend -= dontbother;      /* this one's always in bytes! */
2407   /* We don't know much -- general case. */
2408   if (utf8_target) {
2409    for (;;) {
2410     if (regtry(&reginfo, &s))
2411      goto got_it;
2412     if (s >= strend)
2413      break;
2414     s += UTF8SKIP(s);
2415    };
2416   }
2417   else {
2418    do {
2419     if (regtry(&reginfo, &s))
2420      goto got_it;
2421    } while (s++ < strend);
2422   }
2423  }
2424
2425  /* Failure. */
2426  goto phooey;
2427
2428 got_it:
2429  Safefree(swap);
2430  RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2431
2432  if (PL_reg_eval_set)
2433   restore_pos(aTHX_ prog);
2434  if (RXp_PAREN_NAMES(prog))
2435   (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2436
2437  /* make sure $`, $&, $', and $digit will work later */
2438  if ( !(flags & REXEC_NOT_FIRST) ) {
2439   RX_MATCH_COPY_FREE(rx);
2440   if (flags & REXEC_COPY_STR) {
2441    const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2442 #ifdef PERL_OLD_COPY_ON_WRITE
2443    if ((SvIsCOW(sv)
2444     || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2445     if (DEBUG_C_TEST) {
2446      PerlIO_printf(Perl_debug_log,
2447         "Copy on write: regexp capture, type %d\n",
2448         (int) SvTYPE(sv));
2449     }
2450     prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2451     prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2452     assert (SvPOKp(prog->saved_copy));
2453    } else
2454 #endif
2455    {
2456     RX_MATCH_COPIED_on(rx);
2457     s = savepvn(strbeg, i);
2458     prog->subbeg = s;
2459    }
2460    prog->sublen = i;
2461   }
2462   else {
2463    prog->subbeg = strbeg;
2464    prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2465   }
2466  }
2467
2468  return 1;
2469
2470 phooey:
2471  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2472       PL_colors[4], PL_colors[5]));
2473  if (PL_reg_eval_set)
2474   restore_pos(aTHX_ prog);
2475  if (swap) {
2476   /* we failed :-( roll it back */
2477   Safefree(prog->offs);
2478   prog->offs = swap;
2479  }
2480
2481  return 0;
2482 }
2483
2484
2485 /*
2486  - regtry - try match at specific point
2487  */
2488 STATIC I32   /* 0 failure, 1 success */
2489 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2490 {
2491  dVAR;
2492  CHECKPOINT lastcp;
2493  REGEXP *const rx = reginfo->prog;
2494  regexp *const prog = (struct regexp *)SvANY(rx);
2495  RXi_GET_DECL(prog,progi);
2496  GET_RE_DEBUG_FLAGS_DECL;
2497
2498  PERL_ARGS_ASSERT_REGTRY;
2499
2500  reginfo->cutpoint=NULL;
2501
2502  if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2503   MAGIC *mg;
2504
2505   PL_reg_eval_set = RS_init;
2506   DEBUG_EXECUTE_r(DEBUG_s(
2507    PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2508       (IV)(PL_stack_sp - PL_stack_base));
2509    ));
2510   SAVESTACK_CXPOS();
2511   cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2512   /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2513   SAVETMPS;
2514   /* Apparently this is not needed, judging by wantarray. */
2515   /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2516   cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2517
2518   if (reginfo->sv) {
2519    /* Make $_ available to executed code. */
2520    if (reginfo->sv != DEFSV) {
2521     SAVE_DEFSV;
2522     DEFSV_set(reginfo->sv);
2523    }
2524
2525    if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2526     && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2527     /* prepare for quick setting of pos */
2528 #ifdef PERL_OLD_COPY_ON_WRITE
2529     if (SvIsCOW(reginfo->sv))
2530      sv_force_normal_flags(reginfo->sv, 0);
2531 #endif
2532     mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2533         &PL_vtbl_mglob, NULL, 0);
2534     mg->mg_len = -1;
2535    }
2536    PL_reg_magic    = mg;
2537    PL_reg_oldpos   = mg->mg_len;
2538    SAVEDESTRUCTOR_X(restore_pos, prog);
2539   }
2540   if (!PL_reg_curpm) {
2541    Newxz(PL_reg_curpm, 1, PMOP);
2542 #ifdef USE_ITHREADS
2543    {
2544     SV* const repointer = &PL_sv_undef;
2545     /* this regexp is also owned by the new PL_reg_curpm, which
2546     will try to free it.  */
2547     av_push(PL_regex_padav, repointer);
2548     PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2549     PL_regex_pad = AvARRAY(PL_regex_padav);
2550    }
2551 #endif
2552   }
2553 #ifdef USE_ITHREADS
2554   /* It seems that non-ithreads works both with and without this code.
2555   So for efficiency reasons it seems best not to have the code
2556   compiled when it is not needed.  */
2557   /* This is safe against NULLs: */
2558   ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2559   /* PM_reg_curpm owns a reference to this regexp.  */
2560   (void)ReREFCNT_inc(rx);
2561 #endif
2562   PM_SETRE(PL_reg_curpm, rx);
2563   PL_reg_oldcurpm = PL_curpm;
2564   PL_curpm = PL_reg_curpm;
2565   if (RXp_MATCH_COPIED(prog)) {
2566    /*  Here is a serious problem: we cannot rewrite subbeg,
2567     since it may be needed if this match fails.  Thus
2568     $` inside (?{}) could fail... */
2569    PL_reg_oldsaved = prog->subbeg;
2570    PL_reg_oldsavedlen = prog->sublen;
2571 #ifdef PERL_OLD_COPY_ON_WRITE
2572    PL_nrs = prog->saved_copy;
2573 #endif
2574    RXp_MATCH_COPIED_off(prog);
2575   }
2576   else
2577    PL_reg_oldsaved = NULL;
2578   prog->subbeg = PL_bostr;
2579   prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2580  }
2581  DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2582  prog->offs[0].start = *startpos - PL_bostr;
2583  PL_reginput = *startpos;
2584  PL_reglastparen = &prog->lastparen;
2585  PL_reglastcloseparen = &prog->lastcloseparen;
2586  prog->lastparen = 0;
2587  prog->lastcloseparen = 0;
2588  PL_regsize = 0;
2589  PL_regoffs = prog->offs;
2590  if (PL_reg_start_tmpl <= prog->nparens) {
2591   PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2592   if(PL_reg_start_tmp)
2593    Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2594   else
2595    Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2596  }
2597
2598  /* XXXX What this code is doing here?!!!  There should be no need
2599  to do this again and again, PL_reglastparen should take care of
2600  this!  --ilya*/
2601
2602  /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2603  * Actually, the code in regcppop() (which Ilya may be meaning by
2604  * PL_reglastparen), is not needed at all by the test suite
2605  * (op/regexp, op/pat, op/split), but that code is needed otherwise
2606  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2607  * Meanwhile, this code *is* needed for the
2608  * above-mentioned test suite tests to succeed.  The common theme
2609  * on those tests seems to be returning null fields from matches.
2610  * --jhi updated by dapm */
2611 #if 1
2612  if (prog->nparens) {
2613   regexp_paren_pair *pp = PL_regoffs;
2614   register I32 i;
2615   for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2616    ++pp;
2617    pp->start = -1;
2618    pp->end = -1;
2619   }
2620  }
2621 #endif
2622  REGCP_SET(lastcp);
2623  if (regmatch(reginfo, progi->program + 1)) {
2624   PL_regoffs[0].end = PL_reginput - PL_bostr;
2625   return 1;
2626  }
2627  if (reginfo->cutpoint)
2628   *startpos= reginfo->cutpoint;
2629  REGCP_UNWIND(lastcp);
2630  return 0;
2631 }
2632
2633
2634 #define sayYES goto yes
2635 #define sayNO goto no
2636 #define sayNO_SILENT goto no_silent
2637
2638 /* we dont use STMT_START/END here because it leads to
2639    "unreachable code" warnings, which are bogus, but distracting. */
2640 #define CACHEsayNO \
2641  if (ST.cache_mask) \
2642  PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2643  sayNO
2644
2645 /* this is used to determine how far from the left messages like
2646    'failed...' are printed. It should be set such that messages
2647    are inline with the regop output that created them.
2648 */
2649 #define REPORT_CODE_OFF 32
2650
2651
2652 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2653 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2654
2655 #define SLAB_FIRST(s) (&(s)->states[0])
2656 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2657
2658 /* grab a new slab and return the first slot in it */
2659
2660 STATIC regmatch_state *
2661 S_push_slab(pTHX)
2662 {
2663 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2664  dMY_CXT;
2665 #endif
2666  regmatch_slab *s = PL_regmatch_slab->next;
2667  if (!s) {
2668   Newx(s, 1, regmatch_slab);
2669   s->prev = PL_regmatch_slab;
2670   s->next = NULL;
2671   PL_regmatch_slab->next = s;
2672  }
2673  PL_regmatch_slab = s;
2674  return SLAB_FIRST(s);
2675 }
2676
2677
2678 /* push a new state then goto it */
2679
2680 #define PUSH_STATE_GOTO(state, node) \
2681  scan = node; \
2682  st->resume_state = state; \
2683  goto push_state;
2684
2685 /* push a new state with success backtracking, then goto it */
2686
2687 #define PUSH_YES_STATE_GOTO(state, node) \
2688  scan = node; \
2689  st->resume_state = state; \
2690  goto push_yes_state;
2691
2692
2693
2694 /*
2695
2696 regmatch() - main matching routine
2697
2698 This is basically one big switch statement in a loop. We execute an op,
2699 set 'next' to point the next op, and continue. If we come to a point which
2700 we may need to backtrack to on failure such as (A|B|C), we push a
2701 backtrack state onto the backtrack stack. On failure, we pop the top
2702 state, and re-enter the loop at the state indicated. If there are no more
2703 states to pop, we return failure.
2704
2705 Sometimes we also need to backtrack on success; for example /A+/, where
2706 after successfully matching one A, we need to go back and try to
2707 match another one; similarly for lookahead assertions: if the assertion
2708 completes successfully, we backtrack to the state just before the assertion
2709 and then carry on.  In these cases, the pushed state is marked as
2710 'backtrack on success too'. This marking is in fact done by a chain of
2711 pointers, each pointing to the previous 'yes' state. On success, we pop to
2712 the nearest yes state, discarding any intermediate failure-only states.
2713 Sometimes a yes state is pushed just to force some cleanup code to be
2714 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2715 it to free the inner regex.
2716
2717 Note that failure backtracking rewinds the cursor position, while
2718 success backtracking leaves it alone.
2719
2720 A pattern is complete when the END op is executed, while a subpattern
2721 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2722 ops trigger the "pop to last yes state if any, otherwise return true"
2723 behaviour.
2724
2725 A common convention in this function is to use A and B to refer to the two
2726 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2727 the subpattern to be matched possibly multiple times, while B is the entire
2728 rest of the pattern. Variable and state names reflect this convention.
2729
2730 The states in the main switch are the union of ops and failure/success of
2731 substates associated with with that op.  For example, IFMATCH is the op
2732 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2733 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2734 successfully matched A and IFMATCH_A_fail is a state saying that we have
2735 just failed to match A. Resume states always come in pairs. The backtrack
2736 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2737 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2738 on success or failure.
2739
2740 The struct that holds a backtracking state is actually a big union, with
2741 one variant for each major type of op. The variable st points to the
2742 top-most backtrack struct. To make the code clearer, within each
2743 block of code we #define ST to alias the relevant union.
2744
2745 Here's a concrete example of a (vastly oversimplified) IFMATCH
2746 implementation:
2747
2748  switch (state) {
2749  ....
2750
2751 #define ST st->u.ifmatch
2752
2753  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2754   ST.foo = ...; // some state we wish to save
2755   ...
2756   // push a yes backtrack state with a resume value of
2757   // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2758   // first node of A:
2759   PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2760   // NOTREACHED
2761
2762  case IFMATCH_A: // we have successfully executed A; now continue with B
2763   next = B;
2764   bar = ST.foo; // do something with the preserved value
2765   break;
2766
2767  case IFMATCH_A_fail: // A failed, so the assertion failed
2768   ...;   // do some housekeeping, then ...
2769   sayNO; // propagate the failure
2770
2771 #undef ST
2772
2773  ...
2774  }
2775
2776 For any old-timers reading this who are familiar with the old recursive
2777 approach, the code above is equivalent to:
2778
2779  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2780  {
2781   int foo = ...
2782   ...
2783   if (regmatch(A)) {
2784    next = B;
2785    bar = foo;
2786    break;
2787   }
2788   ...;   // do some housekeeping, then ...
2789   sayNO; // propagate the failure
2790  }
2791
2792 The topmost backtrack state, pointed to by st, is usually free. If you
2793 want to claim it, populate any ST.foo fields in it with values you wish to
2794 save, then do one of
2795
2796   PUSH_STATE_GOTO(resume_state, node);
2797   PUSH_YES_STATE_GOTO(resume_state, node);
2798
2799 which sets that backtrack state's resume value to 'resume_state', pushes a
2800 new free entry to the top of the backtrack stack, then goes to 'node'.
2801 On backtracking, the free slot is popped, and the saved state becomes the
2802 new free state. An ST.foo field in this new top state can be temporarily
2803 accessed to retrieve values, but once the main loop is re-entered, it
2804 becomes available for reuse.
2805
2806 Note that the depth of the backtrack stack constantly increases during the
2807 left-to-right execution of the pattern, rather than going up and down with
2808 the pattern nesting. For example the stack is at its maximum at Z at the
2809 end of the pattern, rather than at X in the following:
2810
2811  /(((X)+)+)+....(Y)+....Z/
2812
2813 The only exceptions to this are lookahead/behind assertions and the cut,
2814 (?>A), which pop all the backtrack states associated with A before
2815 continuing.
2816
2817 Backtrack state structs are allocated in slabs of about 4K in size.
2818 PL_regmatch_state and st always point to the currently active state,
2819 and PL_regmatch_slab points to the slab currently containing
2820 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2821 allocated, and is never freed until interpreter destruction. When the slab
2822 is full, a new one is allocated and chained to the end. At exit from
2823 regmatch(), slabs allocated since entry are freed.
2824
2825 */
2826
2827
2828 #define DEBUG_STATE_pp(pp)        \
2829  DEBUG_STATE_r({         \
2830   DUMP_EXEC_POS(locinput, scan, utf8_target);      \
2831   PerlIO_printf(Perl_debug_log,       \
2832    "    %*s"pp" %s%s%s%s%s\n",       \
2833    depth*2, "",        \
2834    PL_reg_name[st->resume_state],                     \
2835    ((st==yes_state||st==mark_state) ? "[" : ""),   \
2836    ((st==yes_state) ? "Y" : ""),                   \
2837    ((st==mark_state) ? "M" : ""),                  \
2838    ((st==yes_state||st==mark_state) ? "]" : "")    \
2839   );                                                  \
2840  });
2841
2842
2843 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2844
2845 #ifdef DEBUGGING
2846
2847 STATIC void
2848 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2849  const char *start, const char *end, const char *blurb)
2850 {
2851  const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2852
2853  PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2854
2855  if (!PL_colorset)
2856    reginitcolors();
2857  {
2858   RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2859    RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2860
2861   RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2862    start, end - start, 60);
2863
2864   PerlIO_printf(Perl_debug_log,
2865    "%s%s REx%s %s against %s\n",
2866      PL_colors[4], blurb, PL_colors[5], s0, s1);
2867
2868   if (utf8_target||utf8_pat)
2869    PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2870     utf8_pat ? "pattern" : "",
2871     utf8_pat && utf8_target ? " and " : "",
2872     utf8_target ? "string" : ""
2873    );
2874  }
2875 }
2876
2877 STATIC void
2878 S_dump_exec_pos(pTHX_ const char *locinput,
2879      const regnode *scan,
2880      const char *loc_regeol,
2881      const char *loc_bostr,
2882      const char *loc_reg_starttry,
2883      const bool utf8_target)
2884 {
2885  const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2886  const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2887  int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2888  /* The part of the string before starttry has one color
2889  (pref0_len chars), between starttry and current
2890  position another one (pref_len - pref0_len chars),
2891  after the current position the third one.
2892  We assume that pref0_len <= pref_len, otherwise we
2893  decrease pref0_len.  */
2894  int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2895   ? (5 + taill) - l : locinput - loc_bostr;
2896  int pref0_len;
2897
2898  PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2899
2900  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2901   pref_len++;
2902  pref0_len = pref_len  - (locinput - loc_reg_starttry);
2903  if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2904   l = ( loc_regeol - locinput > (5 + taill) - pref_len
2905    ? (5 + taill) - pref_len : loc_regeol - locinput);
2906  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2907   l--;
2908  if (pref0_len < 0)
2909   pref0_len = 0;
2910  if (pref0_len > pref_len)
2911   pref0_len = pref_len;
2912  {
2913   const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2914
2915   RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2916    (locinput - pref_len),pref0_len, 60, 4, 5);
2917
2918   RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2919      (locinput - pref_len + pref0_len),
2920      pref_len - pref0_len, 60, 2, 3);
2921
2922   RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2923      locinput, loc_regeol - locinput, 10, 0, 1);
2924
2925   const STRLEN tlen=len0+len1+len2;
2926   PerlIO_printf(Perl_debug_log,
2927      "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2928      (IV)(locinput - loc_bostr),
2929      len0, s0,
2930      len1, s1,
2931      (docolor ? "" : "> <"),
2932      len2, s2,
2933      (int)(tlen > 19 ? 0 :  19 - tlen),
2934      "");
2935  }
2936 }
2937
2938 #endif
2939
2940 /* reg_check_named_buff_matched()
2941  * Checks to see if a named buffer has matched. The data array of
2942  * buffer numbers corresponding to the buffer is expected to reside
2943  * in the regexp->data->data array in the slot stored in the ARG() of
2944  * node involved. Note that this routine doesn't actually care about the
2945  * name, that information is not preserved from compilation to execution.
2946  * Returns the index of the leftmost defined buffer with the given name
2947  * or 0 if non of the buffers matched.
2948  */
2949 STATIC I32
2950 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2951 {
2952  I32 n;
2953  RXi_GET_DECL(rex,rexi);
2954  SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2955  I32 *nums=(I32*)SvPVX(sv_dat);
2956
2957  PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2958
2959  for ( n=0; n<SvIVX(sv_dat); n++ ) {
2960   if ((I32)*PL_reglastparen >= nums[n] &&
2961    PL_regoffs[nums[n]].end != -1)
2962   {
2963    return nums[n];
2964   }
2965  }
2966  return 0;
2967 }
2968
2969
2970 /* free all slabs above current one  - called during LEAVE_SCOPE */
2971
2972 STATIC void
2973 S_clear_backtrack_stack(pTHX_ void *p)
2974 {
2975  regmatch_slab *s = PL_regmatch_slab->next;
2976  PERL_UNUSED_ARG(p);
2977
2978  if (!s)
2979   return;
2980  PL_regmatch_slab->next = NULL;
2981  while (s) {
2982   regmatch_slab * const osl = s;
2983   s = s->next;
2984   Safefree(osl);
2985  }
2986 }
2987
2988
2989 #define SETREX(Re1,Re2) \
2990  if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2991  Re1 = (Re2)
2992
2993 STATIC I32   /* 0 failure, 1 success */
2994 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2995 {
2996 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2997  dMY_CXT;
2998 #endif
2999  dVAR;
3000  register const bool utf8_target = PL_reg_match_utf8;
3001  const U32 uniflags = UTF8_ALLOW_DEFAULT;
3002  REGEXP *rex_sv = reginfo->prog;
3003  regexp *rex = (struct regexp *)SvANY(rex_sv);
3004  RXi_GET_DECL(rex,rexi);
3005  I32 oldsave;
3006  /* the current state. This is a cached copy of PL_regmatch_state */
3007  register regmatch_state *st;
3008  /* cache heavy used fields of st in registers */
3009  register regnode *scan;
3010  register regnode *next;
3011  register U32 n = 0; /* general value; init to avoid compiler warning */
3012  register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3013  register char *locinput = PL_reginput;
3014  register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3015
3016  bool result = 0;     /* return value of S_regmatch */
3017  int depth = 0;     /* depth of backtrack stack */
3018  U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3019  const U32 max_nochange_depth =
3020   (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3021   3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3022  regmatch_state *yes_state = NULL; /* state to pop to on success of
3023                subpattern */
3024  /* mark_state piggy backs on the yes_state logic so that when we unwind
3025  the stack on success we can update the mark_state as we go */
3026  regmatch_state *mark_state = NULL; /* last mark state we have seen */
3027  regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3028  struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3029  U32 state_num;
3030  bool no_final = 0;      /* prevent failure from backtracking? */
3031  bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3032  char *startpoint = PL_reginput;
3033  SV *popmark = NULL;     /* are we looking for a mark? */
3034  SV *sv_commit = NULL;   /* last mark name seen in failure */
3035  SV *sv_yes_mark = NULL; /* last mark name we have seen
3036        during a successful match */
3037  U32 lastopen = 0;       /* last open we saw */
3038  bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3039  SV* const oreplsv = GvSV(PL_replgv);
3040  /* these three flags are set by various ops to signal information to
3041  * the very next op. They have a useful lifetime of exactly one loop
3042  * iteration, and are not preserved or restored by state pushes/pops
3043  */
3044  bool sw = 0;     /* the condition value in (?(cond)a|b) */
3045  bool minmod = 0;     /* the next "{n,m}" is a "{n,m}?" */
3046  int logical = 0;     /* the following EVAL is:
3047         0: (?{...})
3048         1: (?(?{...})X|Y)
3049         2: (??{...})
3050        or the following IFMATCH/UNLESSM is:
3051         false: plain (?=foo)
3052         true:  used as a condition: (?(?=foo))
3053        */
3054 #ifdef DEBUGGING
3055  GET_RE_DEBUG_FLAGS_DECL;
3056 #endif
3057
3058  PERL_ARGS_ASSERT_REGMATCH;
3059
3060  DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3061    PerlIO_printf(Perl_debug_log,"regmatch start\n");
3062  }));
3063  /* on first ever call to regmatch, allocate first slab */
3064  if (!PL_regmatch_slab) {
3065   Newx(PL_regmatch_slab, 1, regmatch_slab);
3066   PL_regmatch_slab->prev = NULL;
3067   PL_regmatch_slab->next = NULL;
3068   PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3069  }
3070
3071  oldsave = PL_savestack_ix;
3072  SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3073  SAVEVPTR(PL_regmatch_slab);
3074  SAVEVPTR(PL_regmatch_state);
3075
3076  /* grab next free state slot */
3077  st = ++PL_regmatch_state;
3078  if (st >  SLAB_LAST(PL_regmatch_slab))
3079   st = PL_regmatch_state = S_push_slab(aTHX);
3080
3081  /* Note that nextchr is a byte even in UTF */
3082  nextchr = UCHARAT(locinput);
3083  scan = prog;
3084  while (scan != NULL) {
3085
3086   DEBUG_EXECUTE_r( {
3087    SV * const prop = sv_newmortal();
3088    regnode *rnext=regnext(scan);
3089    DUMP_EXEC_POS( locinput, scan, utf8_target );
3090    regprop(rex, prop, scan);
3091
3092    PerlIO_printf(Perl_debug_log,
3093      "%3"IVdf":%*s%s(%"IVdf")\n",
3094      (IV)(scan - rexi->program), depth*2, "",
3095      SvPVX_const(prop),
3096      (PL_regkind[OP(scan)] == END || !rnext) ?
3097       0 : (IV)(rnext - rexi->program));
3098   });
3099
3100   next = scan + NEXT_OFF(scan);
3101   if (next == scan)
3102    next = NULL;
3103   state_num = OP(scan);
3104
3105   REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3106  reenter_switch:
3107
3108   assert(PL_reglastparen == &rex->lastparen);
3109   assert(PL_reglastcloseparen == &rex->lastcloseparen);
3110   assert(PL_regoffs == rex->offs);
3111
3112   switch (state_num) {
3113   case BOL:
3114    if (locinput == PL_bostr)
3115    {
3116     /* reginfo->till = reginfo->bol; */
3117     break;
3118    }
3119    sayNO;
3120   case MBOL:
3121    if (locinput == PL_bostr ||
3122     ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3123    {
3124     break;
3125    }
3126    sayNO;
3127   case SBOL:
3128    if (locinput == PL_bostr)
3129     break;
3130    sayNO;
3131   case GPOS:
3132    if (locinput == reginfo->ganch)
3133     break;
3134    sayNO;
3135
3136   case KEEPS:
3137    /* update the startpoint */
3138    st->u.keeper.val = PL_regoffs[0].start;
3139    PL_reginput = locinput;
3140    PL_regoffs[0].start = locinput - PL_bostr;
3141    PUSH_STATE_GOTO(KEEPS_next, next);
3142    /*NOT-REACHED*/
3143   case KEEPS_next_fail:
3144    /* rollback the start point change */
3145    PL_regoffs[0].start = st->u.keeper.val;
3146    sayNO_SILENT;
3147    /*NOT-REACHED*/
3148   case EOL:
3149     goto seol;
3150   case MEOL:
3151    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3152     sayNO;
3153    break;
3154   case SEOL:
3155   seol:
3156    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3157     sayNO;
3158    if (PL_regeol - locinput > 1)
3159     sayNO;
3160    break;
3161   case EOS:
3162    if (PL_regeol != locinput)
3163     sayNO;
3164    break;
3165   case SANY:
3166    if (!nextchr && locinput >= PL_regeol)
3167     sayNO;
3168    if (utf8_target) {
3169     locinput += PL_utf8skip[nextchr];
3170     if (locinput > PL_regeol)
3171      sayNO;
3172     nextchr = UCHARAT(locinput);
3173    }
3174    else
3175     nextchr = UCHARAT(++locinput);
3176    break;
3177   case CANY:
3178    if (!nextchr && locinput >= PL_regeol)
3179     sayNO;
3180    nextchr = UCHARAT(++locinput);
3181    break;
3182   case REG_ANY:
3183    if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3184     sayNO;
3185    if (utf8_target) {
3186     locinput += PL_utf8skip[nextchr];
3187     if (locinput > PL_regeol)
3188      sayNO;
3189     nextchr = UCHARAT(locinput);
3190    }
3191    else
3192     nextchr = UCHARAT(++locinput);
3193    break;
3194
3195 #undef  ST
3196 #define ST st->u.trie
3197   case TRIEC:
3198    /* In this case the charclass data is available inline so
3199    we can fail fast without a lot of extra overhead.
3200    */
3201    if (scan->flags == EXACT || !utf8_target) {
3202     if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3203      DEBUG_EXECUTE_r(
3204       PerlIO_printf(Perl_debug_log,
3205          "%*s  %sfailed to match trie start class...%s\n",
3206          REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3207      );
3208      sayNO_SILENT;
3209      /* NOTREACHED */
3210     }
3211    }
3212    /* FALL THROUGH */
3213   case TRIE:
3214    /* the basic plan of execution of the trie is:
3215    * At the beginning, run though all the states, and
3216    * find the longest-matching word. Also remember the position
3217    * of the shortest matching word. For example, this pattern:
3218    *    1  2 3 4    5
3219    *    ab|a|x|abcd|abc
3220    * when matched against the string "abcde", will generate
3221    * accept states for all words except 3, with the longest
3222    * matching word being 4, and the shortest being 1 (with
3223    * the position being after char 1 of the string).
3224    *
3225    * Then for each matching word, in word order (i.e. 1,2,4,5),
3226    * we run the remainder of the pattern; on each try setting
3227    * the current position to the character following the word,
3228    * returning to try the next word on failure.
3229    *
3230    * We avoid having to build a list of words at runtime by
3231    * using a compile-time structure, wordinfo[].prev, which
3232    * gives, for each word, the previous accepting word (if any).
3233    * In the case above it would contain the mappings 1->2, 2->0,
3234    * 3->0, 4->5, 5->1.  We can use this table to generate, from
3235    * the longest word (4 above), a list of all words, by
3236    * following the list of prev pointers; this gives us the
3237    * unordered list 4,5,1,2. Then given the current word we have
3238    * just tried, we can go through the list and find the
3239    * next-biggest word to try (so if we just failed on word 2,
3240    * the next in the list is 4).
3241    *
3242    * Since at runtime we don't record the matching position in
3243    * the string for each word, we have to work that out for
3244    * each word we're about to process. The wordinfo table holds
3245    * the character length of each word; given that we recorded
3246    * at the start: the position of the shortest word and its
3247    * length in chars, we just need to move the pointer the
3248    * difference between the two char lengths. Depending on
3249    * Unicode status and folding, that's cheap or expensive.
3250    *
3251    * This algorithm is optimised for the case where are only a
3252    * small number of accept states, i.e. 0,1, or maybe 2.
3253    * With lots of accepts states, and having to try all of them,
3254    * it becomes quadratic on number of accept states to find all
3255    * the next words.
3256    */
3257
3258    {
3259     /* what type of TRIE am I? (utf8 makes this contextual) */
3260     DECL_TRIE_TYPE(scan);
3261
3262     /* what trie are we using right now */
3263     reg_trie_data * const trie
3264      = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3265     HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3266     U32 state = trie->startstate;
3267
3268     if (trie->bitmap && trie_type != trie_utf8_fold &&
3269      !TRIE_BITMAP_TEST(trie,*locinput)
3270     ) {
3271      if (trie->states[ state ].wordnum) {
3272       DEBUG_EXECUTE_r(
3273        PerlIO_printf(Perl_debug_log,
3274           "%*s  %smatched empty string...%s\n",
3275           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3276       );
3277       if (!trie->jump)
3278        break;
3279      } else {
3280       DEBUG_EXECUTE_r(
3281        PerlIO_printf(Perl_debug_log,
3282           "%*s  %sfailed to match trie start class...%s\n",
3283           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3284       );
3285       sayNO_SILENT;
3286     }
3287     }
3288
3289    {
3290     U8 *uc = ( U8* )locinput;
3291
3292     STRLEN len = 0;
3293     STRLEN foldlen = 0;
3294     U8 *uscan = (U8*)NULL;
3295     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3296     U32 charcount = 0; /* how many input chars we have matched */
3297     U32 accepted = 0; /* have we seen any accepting states? */
3298
3299     ST.B = next;
3300     ST.jump = trie->jump;
3301     ST.me = scan;
3302     ST.firstpos = NULL;
3303     ST.longfold = FALSE; /* char longer if folded => it's harder */
3304     ST.nextword = 0;
3305
3306     /* fully traverse the TRIE; note the position of the
3307     shortest accept state and the wordnum of the longest
3308     accept state */
3309
3310     while ( state && uc <= (U8*)PL_regeol ) {
3311      U32 base = trie->states[ state ].trans.base;
3312      UV uvc = 0;
3313      U16 charid = 0;
3314      U16 wordnum;
3315      wordnum = trie->states[ state ].wordnum;
3316
3317      if (wordnum) { /* it's an accept state */
3318       if (!accepted) {
3319        accepted = 1;
3320        /* record first match position */
3321        if (ST.longfold) {
3322         ST.firstpos = (U8*)locinput;
3323         ST.firstchars = 0;
3324        }
3325        else {
3326         ST.firstpos = uc;
3327         ST.firstchars = charcount;
3328        }
3329       }
3330       if (!ST.nextword || wordnum < ST.nextword)
3331        ST.nextword = wordnum;
3332       ST.topword = wordnum;
3333      }
3334
3335      DEBUG_TRIE_EXECUTE_r({
3336         DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3337         PerlIO_printf( Perl_debug_log,
3338          "%*s  %sState: %4"UVxf" Accepted: %c ",
3339          2+depth * 2, "", PL_colors[4],
3340          (UV)state, (accepted ? 'Y' : 'N'));
3341      });
3342
3343      /* read a char and goto next state */
3344      if ( base ) {
3345       I32 offset;
3346       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3347            uscan, len, uvc, charid, foldlen,
3348            foldbuf, uniflags);
3349       charcount++;
3350       if (foldlen>0)
3351        ST.longfold = TRUE;
3352       if (charid &&
3353        ( ((offset =
3354        base + charid - 1 - trie->uniquecharcount)) >= 0)
3355
3356        && ((U32)offset < trie->lasttrans)
3357        && trie->trans[offset].check == state)
3358       {
3359        state = trie->trans[offset].next;
3360       }
3361       else {
3362        state = 0;
3363       }
3364       uc += len;
3365
3366      }
3367      else {
3368       state = 0;
3369      }
3370      DEBUG_TRIE_EXECUTE_r(
3371       PerlIO_printf( Perl_debug_log,
3372        "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3373        charid, uvc, (UV)state, PL_colors[5] );
3374      );
3375     }
3376     if (!accepted)
3377     sayNO;
3378
3379     /* calculate total number of accept states */
3380     {
3381      U16 w = ST.topword;
3382      accepted = 0;
3383      while (w) {
3384       w = trie->wordinfo[w].prev;
3385       accepted++;
3386      }
3387      ST.accepted = accepted;
3388     }
3389
3390     DEBUG_EXECUTE_r(
3391      PerlIO_printf( Perl_debug_log,
3392       "%*s  %sgot %"IVdf" possible matches%s\n",
3393       REPORT_CODE_OFF + depth * 2, "",
3394       PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3395     );
3396     goto trie_first_try; /* jump into the fail handler */
3397    }}
3398    /* NOTREACHED */
3399
3400   case TRIE_next_fail: /* we failed - try next alternative */
3401    if ( ST.jump) {
3402     REGCP_UNWIND(ST.cp);
3403     for (n = *PL_reglastparen; n > ST.lastparen; n--)
3404      PL_regoffs[n].end = -1;
3405     *PL_reglastparen = n;
3406    }
3407    if (!--ST.accepted) {
3408     DEBUG_EXECUTE_r({
3409      PerlIO_printf( Perl_debug_log,
3410       "%*s  %sTRIE failed...%s\n",
3411       REPORT_CODE_OFF+depth*2, "",
3412       PL_colors[4],
3413       PL_colors[5] );
3414     });
3415     sayNO_SILENT;
3416    }
3417    {
3418     /* Find next-highest word to process.  Note that this code
3419     * is O(N^2) per trie run (O(N) per branch), so keep tight */
3420     register U16 min = 0;
3421     register U16 word;
3422     register U16 const nextword = ST.nextword;
3423     register reg_trie_wordinfo * const wordinfo
3424      = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3425     for (word=ST.topword; word; word=wordinfo[word].prev) {
3426      if (word > nextword && (!min || word < min))
3427       min = word;
3428     }
3429     ST.nextword = min;
3430    }
3431
3432   trie_first_try:
3433    if (do_cutgroup) {
3434     do_cutgroup = 0;
3435     no_final = 0;
3436    }
3437
3438    if ( ST.jump) {
3439     ST.lastparen = *PL_reglastparen;
3440     REGCP_SET(ST.cp);
3441    }
3442
3443    /* find start char of end of current word */
3444    {
3445     U32 chars; /* how many chars to skip */
3446     U8 *uc = ST.firstpos;
3447     reg_trie_data * const trie
3448      = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3449
3450     assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3451        >=  ST.firstchars);
3452     chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3453        - ST.firstchars;
3454
3455     if (ST.longfold) {
3456      /* the hard option - fold each char in turn and find
3457      * its folded length (which may be different */
3458      U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3459      STRLEN foldlen;
3460      STRLEN len;
3461      UV uvc;
3462      U8 *uscan;
3463
3464      while (chars) {
3465       if (utf8_target) {
3466        uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3467              uniflags);
3468        uc += len;
3469       }
3470       else {
3471        uvc = *uc;
3472        uc++;
3473       }
3474       uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3475       uscan = foldbuf;
3476       while (foldlen) {
3477        if (!--chars)
3478         break;
3479        uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3480            uniflags);
3481        uscan += len;
3482        foldlen -= len;
3483       }
3484      }
3485     }
3486     else {
3487      if (utf8_target)
3488       while (chars--)
3489        uc += UTF8SKIP(uc);
3490      else
3491       uc += chars;
3492     }
3493     PL_reginput = (char *)uc;
3494    }
3495
3496    scan = (ST.jump && ST.jump[ST.nextword])
3497       ? ST.me + ST.jump[ST.nextword]
3498       : ST.B;
3499
3500    DEBUG_EXECUTE_r({
3501     PerlIO_printf( Perl_debug_log,
3502      "%*s  %sTRIE matched word #%d, continuing%s\n",
3503      REPORT_CODE_OFF+depth*2, "",
3504      PL_colors[4],
3505      ST.nextword,
3506      PL_colors[5]
3507      );
3508    });
3509
3510    if (ST.accepted > 1 || has_cutgroup) {
3511     PUSH_STATE_GOTO(TRIE_next, scan);
3512     /* NOTREACHED */
3513    }
3514    /* only one choice left - just continue */
3515    DEBUG_EXECUTE_r({
3516     AV *const trie_words
3517      = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3518     SV ** const tmp = av_fetch( trie_words,
3519      ST.nextword-1, 0 );
3520     SV *sv= tmp ? sv_newmortal() : NULL;
3521
3522     PerlIO_printf( Perl_debug_log,
3523      "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3524      REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3525      ST.nextword,
3526      tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3527        PL_colors[0], PL_colors[1],
3528        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3529       )
3530      : "not compiled under -Dr",
3531      PL_colors[5] );
3532    });
3533
3534    locinput = PL_reginput;
3535    nextchr = UCHARAT(locinput);
3536    continue; /* execute rest of RE */
3537    /* NOTREACHED */
3538 #undef  ST
3539
3540   case EXACT: {
3541    char *s = STRING(scan);
3542    ln = STR_LEN(scan);
3543    if (utf8_target != UTF_PATTERN) {
3544     /* The target and the pattern have differing utf8ness. */
3545     char *l = locinput;
3546     const char * const e = s + ln;
3547
3548     if (utf8_target) {
3549      /* The target is utf8, the pattern is not utf8. */
3550      while (s < e) {
3551       STRLEN ulen;
3552       if (l >= PL_regeol)
3553        sayNO;
3554       if (NATIVE_TO_UNI(*(U8*)s) !=
3555        utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3556            uniflags))
3557        sayNO;
3558       l += ulen;
3559       s ++;
3560      }
3561     }
3562     else {
3563      /* The target is not utf8, the pattern is utf8. */
3564      while (s < e) {
3565       STRLEN ulen;
3566       if (l >= PL_regeol)
3567        sayNO;
3568       if (NATIVE_TO_UNI(*((U8*)l)) !=
3569        utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3570           uniflags))
3571        sayNO;
3572       s += ulen;
3573       l ++;
3574      }
3575     }
3576     locinput = l;
3577     nextchr = UCHARAT(locinput);
3578     break;
3579    }
3580    /* The target and the pattern have the same utf8ness. */
3581    /* Inline the first character, for speed. */
3582    if (UCHARAT(s) != nextchr)
3583     sayNO;
3584    if (PL_regeol - locinput < ln)
3585     sayNO;
3586    if (ln > 1 && memNE(s, locinput, ln))
3587     sayNO;
3588    locinput += ln;
3589    nextchr = UCHARAT(locinput);
3590    break;
3591    }
3592   case EXACTFL: {
3593    re_fold_t folder;
3594    const U8 * fold_array;
3595    const char * s;
3596    U32 fold_utf8_flags;
3597
3598    PL_reg_flags |= RF_tainted;
3599    folder = foldEQ_locale;
3600    fold_array = PL_fold_locale;
3601    fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3602    goto do_exactf;
3603
3604   case EXACTFU:
3605    folder = foldEQ_latin1;
3606    fold_array = PL_fold_latin1;
3607    fold_utf8_flags = 0;
3608    goto do_exactf;
3609
3610   case EXACTFA:
3611    folder = foldEQ_latin1;
3612    fold_array = PL_fold_latin1;
3613    fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3614    goto do_exactf;
3615
3616   case EXACTF:
3617    folder = foldEQ;
3618    fold_array = PL_fold;
3619    fold_utf8_flags = 0;
3620
3621   do_exactf:
3622    s = STRING(scan);
3623    ln = STR_LEN(scan);
3624
3625    if (utf8_target || UTF_PATTERN) {
3626    /* Either target or the pattern are utf8. */
3627     const char * const l = locinput;
3628     char *e = PL_regeol;
3629
3630     if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3631        l, &e, 0,  utf8_target, fold_utf8_flags))
3632     {
3633      sayNO;
3634     }
3635     locinput = e;
3636     nextchr = UCHARAT(locinput);
3637     break;
3638    }
3639
3640    /* Neither the target nor the pattern are utf8 */
3641    if (UCHARAT(s) != nextchr &&
3642     UCHARAT(s) != fold_array[nextchr])
3643    {
3644     sayNO;
3645    }
3646    if (PL_regeol - locinput < ln)
3647     sayNO;
3648    if (ln > 1 && ! folder(s, locinput, ln))
3649     sayNO;
3650    locinput += ln;
3651    nextchr = UCHARAT(locinput);
3652    break;
3653   }
3654
3655   /* XXX Could improve efficiency by separating these all out using a
3656   * macro or in-line function.  At that point regcomp.c would no longer
3657   * have to set the FLAGS fields of these */
3658   case BOUNDL:
3659   case NBOUNDL:
3660    PL_reg_flags |= RF_tainted;
3661    /* FALL THROUGH */
3662   case BOUND:
3663   case BOUNDU:
3664   case BOUNDA:
3665   case NBOUND:
3666   case NBOUNDU:
3667   case NBOUNDA:
3668    /* was last char in word? */
3669    if (utf8_target
3670     && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3671     && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3672    {
3673     if (locinput == PL_bostr)
3674      ln = '\n';
3675     else {
3676      const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3677
3678      ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3679     }
3680     if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3681      ln = isALNUM_uni(ln);
3682      LOAD_UTF8_CHARCLASS_ALNUM();
3683      n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3684     }
3685     else {
3686      ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3687      n = isALNUM_LC_utf8((U8*)locinput);
3688     }
3689    }
3690    else {
3691
3692     /* Here the string isn't utf8, or is utf8 and only ascii
3693     * characters are to match \w.  In the latter case looking at
3694     * the byte just prior to the current one may be just the final
3695     * byte of a multi-byte character.  This is ok.  There are two
3696     * cases:
3697     * 1) it is a single byte character, and then the test is doing
3698     * just what it's supposed to.
3699     * 2) it is a multi-byte character, in which case the final
3700     * byte is never mistakable for ASCII, and so the test
3701     * will say it is not a word character, which is the
3702     * correct answer. */
3703     ln = (locinput != PL_bostr) ?
3704      UCHARAT(locinput - 1) : '\n';
3705     switch (FLAGS(scan)) {
3706      case REGEX_UNICODE_CHARSET:
3707       ln = isWORDCHAR_L1(ln);
3708       n = isWORDCHAR_L1(nextchr);
3709       break;
3710      case REGEX_LOCALE_CHARSET:
3711       ln = isALNUM_LC(ln);
3712       n = isALNUM_LC(nextchr);
3713       break;
3714      case REGEX_DEPENDS_CHARSET:
3715       ln = isALNUM(ln);
3716       n = isALNUM(nextchr);
3717       break;
3718      case REGEX_ASCII_RESTRICTED_CHARSET:
3719      case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3720       ln = isWORDCHAR_A(ln);
3721       n = isWORDCHAR_A(nextchr);
3722       break;
3723      default:
3724       Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3725       break;
3726     }
3727    }
3728    /* Note requires that all BOUNDs be lower than all NBOUNDs in
3729    * regcomp.sym */
3730    if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3731      sayNO;
3732    break;
3733   case ANYOFV:
3734   case ANYOF:
3735    if (utf8_target || state_num == ANYOFV) {
3736     STRLEN inclasslen = PL_regeol - locinput;
3737     if (locinput >= PL_regeol)
3738      sayNO;
3739
3740     if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3741      sayNO;
3742     locinput += inclasslen;
3743     nextchr = UCHARAT(locinput);
3744     break;
3745    }
3746    else {
3747     if (nextchr < 0)
3748      nextchr = UCHARAT(locinput);
3749     if (!nextchr && locinput >= PL_regeol)
3750      sayNO;
3751     if (!REGINCLASS(rex, scan, (U8*)locinput))
3752      sayNO;
3753     nextchr = UCHARAT(++locinput);
3754     break;
3755    }
3756    break;
3757   /* Special char classes - The defines start on line 129 or so */
3758   CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3759     ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3760     ALNUMU, NALNUMU, isWORDCHAR_L1,
3761     ALNUMA, NALNUMA, isWORDCHAR_A,
3762     alnum, "a");
3763
3764   CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3765     SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3766     SPACEU, NSPACEU, isSPACE_L1,
3767     SPACEA, NSPACEA, isSPACE_A,
3768     space, " ");
3769
3770   CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3771     DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3772     DIGITA, NDIGITA, isDIGIT_A,
3773     digit, "0");
3774
3775   case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3776      a Unicode extended Grapheme Cluster */
3777    /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3778    extended Grapheme Cluster is:
3779
3780    CR LF
3781    | Prepend* Begin Extend*
3782    | .
3783
3784    Begin is (Hangul-syllable | ! Control)
3785    Extend is (Grapheme_Extend | Spacing_Mark)
3786    Control is [ GCB_Control CR LF ]
3787
3788    The discussion below shows how the code for CLUMP is derived
3789    from this regex.  Note that most of these concepts are from
3790    property values of the Grapheme Cluster Boundary (GCB) property.
3791    No code point can have multiple property values for a given
3792    property.  Thus a code point in Prepend can't be in Control, but
3793    it must be in !Control.  This is why Control above includes
3794    GCB_Control plus CR plus LF.  The latter two are used in the GCB
3795    property separately, and so can't be in GCB_Control, even though
3796    they logically are controls.  Control is not the same as gc=cc,
3797    but includes format and other characters as well.
3798
3799    The Unicode definition of Hangul-syllable is:
3800     L+
3801     | (L* ( ( V | LV ) V* | LVT ) T*)
3802     | T+
3803     )
3804    Each of these is a value for the GCB property, and hence must be
3805    disjoint, so the order they are tested is immaterial, so the
3806    above can safely be changed to
3807     T+
3808     | L+
3809     | (L* ( LVT | ( V | LV ) V*) T*)
3810
3811    The last two terms can be combined like this:
3812     L* ( L
3813       | (( LVT | ( V | LV ) V*) T*))
3814
3815    And refactored into this:
3816     L* (L | LVT T* | V  V* T* | LV  V* T*)
3817
3818    That means that if we have seen any L's at all we can quit
3819    there, but if the next character is a LVT, a V or and LV we
3820    should keep going.
3821
3822    There is a subtlety with Prepend* which showed up in testing.
3823    Note that the Begin, and only the Begin is required in:
3824     | Prepend* Begin Extend*
3825    Also, Begin contains '! Control'.  A Prepend must be a '!
3826    Control', which means it must be a Begin.  What it comes down to
3827    is that if we match Prepend* and then find no suitable Begin
3828    afterwards, that if we backtrack the last Prepend, that one will
3829    be a suitable Begin.
3830    */
3831
3832    if (locinput >= PL_regeol)
3833     sayNO;
3834    if  (! utf8_target) {
3835
3836     /* Match either CR LF  or '.', as all the other possibilities
3837     * require utf8 */
3838     locinput++;     /* Match the . or CR */
3839     if (nextchr == '\r'
3840      && locinput < PL_regeol
3841      && UCHARAT(locinput) == '\n') locinput++;
3842    }
3843    else {
3844
3845     /* Utf8: See if is ( CR LF ); already know that locinput <
3846     * PL_regeol, so locinput+1 is in bounds */
3847     if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3848      locinput += 2;
3849     }
3850     else {
3851      /* In case have to backtrack to beginning, then match '.' */
3852      char *starting = locinput;
3853
3854      /* In case have to backtrack the last prepend */
3855      char *previous_prepend = 0;
3856
3857      LOAD_UTF8_CHARCLASS_GCB();
3858
3859      /* Match (prepend)* */
3860      while (locinput < PL_regeol
3861       && swash_fetch(PL_utf8_X_prepend,
3862           (U8*)locinput, utf8_target))
3863      {
3864       previous_prepend = locinput;
3865       locinput += UTF8SKIP(locinput);
3866      }
3867
3868      /* As noted above, if we matched a prepend character, but
3869      * the next thing won't match, back off the last prepend we
3870      * matched, as it is guaranteed to match the begin */
3871      if (previous_prepend
3872       && (locinput >=  PL_regeol
3873        || ! swash_fetch(PL_utf8_X_begin,
3874            (U8*)locinput, utf8_target)))
3875      {
3876       locinput = previous_prepend;
3877      }
3878
3879      /* Note that here we know PL_regeol > locinput, as we
3880      * tested that upon input to this switch case, and if we
3881      * moved locinput forward, we tested the result just above
3882      * and it either passed, or we backed off so that it will
3883      * now pass */
3884      if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3885
3886       /* Here did not match the required 'Begin' in the
3887       * second term.  So just match the very first
3888       * character, the '.' of the final term of the regex */
3889       locinput = starting + UTF8SKIP(starting);
3890      } else {
3891
3892       /* Here is the beginning of a character that can have
3893       * an extender.  It is either a hangul syllable, or a
3894       * non-control */
3895       if (swash_fetch(PL_utf8_X_non_hangul,
3896           (U8*)locinput, utf8_target))
3897       {
3898
3899        /* Here not a Hangul syllable, must be a
3900        * ('!  * Control') */
3901        locinput += UTF8SKIP(locinput);
3902       } else {
3903
3904        /* Here is a Hangul syllable.  It can be composed
3905        * of several individual characters.  One
3906        * possibility is T+ */
3907        if (swash_fetch(PL_utf8_X_T,
3908            (U8*)locinput, utf8_target))
3909        {
3910         while (locinput < PL_regeol
3911           && swash_fetch(PL_utf8_X_T,
3912               (U8*)locinput, utf8_target))
3913         {
3914          locinput += UTF8SKIP(locinput);
3915         }
3916        } else {
3917
3918         /* Here, not T+, but is a Hangul.  That means
3919         * it is one of the others: L, LV, LVT or V,
3920         * and matches:
3921         * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3922
3923         /* Match L*           */
3924         while (locinput < PL_regeol
3925           && swash_fetch(PL_utf8_X_L,
3926               (U8*)locinput, utf8_target))
3927         {
3928          locinput += UTF8SKIP(locinput);
3929         }
3930
3931         /* Here, have exhausted L*.  If the next
3932         * character is not an LV, LVT nor V, it means
3933         * we had to have at least one L, so matches L+
3934         * in the original equation, we have a complete
3935         * hangul syllable.  Are done. */
3936
3937         if (locinput < PL_regeol
3938          && swash_fetch(PL_utf8_X_LV_LVT_V,
3939              (U8*)locinput, utf8_target))
3940         {
3941
3942          /* Otherwise keep going.  Must be LV, LVT
3943          * or V.  See if LVT */
3944          if (swash_fetch(PL_utf8_X_LVT,
3945              (U8*)locinput, utf8_target))
3946          {
3947           locinput += UTF8SKIP(locinput);
3948          } else {
3949
3950           /* Must be  V or LV.  Take it, then
3951           * match V*     */
3952           locinput += UTF8SKIP(locinput);
3953           while (locinput < PL_regeol
3954             && swash_fetch(PL_utf8_X_V,
3955               (U8*)locinput, utf8_target))
3956           {
3957            locinput += UTF8SKIP(locinput);
3958           }
3959          }
3960
3961          /* And any of LV, LVT, or V can be followed
3962          * by T*            */
3963          while (locinput < PL_regeol
3964           && swash_fetch(PL_utf8_X_T,
3965               (U8*)locinput,
3966               utf8_target))
3967          {
3968           locinput += UTF8SKIP(locinput);
3969          }
3970         }
3971        }
3972       }
3973
3974       /* Match any extender */
3975       while (locinput < PL_regeol
3976         && swash_fetch(PL_utf8_X_extend,
3977             (U8*)locinput, utf8_target))
3978       {
3979        locinput += UTF8SKIP(locinput);
3980       }
3981      }
3982     }
3983     if (locinput > PL_regeol) sayNO;
3984    }
3985    nextchr = UCHARAT(locinput);
3986    break;
3987
3988   case NREFFL:
3989   {   /* The capture buffer cases.  The ones beginning with N for the
3990    named buffers just convert to the equivalent numbered and
3991    pretend they were called as the corresponding numbered buffer
3992    op.  */
3993    /* don't initialize these in the declaration, it makes C++
3994    unhappy */
3995    char *s;
3996    char type;
3997    re_fold_t folder;
3998    const U8 *fold_array;
3999    UV utf8_fold_flags;
4000
4001    PL_reg_flags |= RF_tainted;
4002    folder = foldEQ_locale;
4003    fold_array = PL_fold_locale;
4004    type = REFFL;
4005    utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4006    goto do_nref;
4007
4008   case NREFFA:
4009    folder = foldEQ_latin1;
4010    fold_array = PL_fold_latin1;
4011    type = REFFA;
4012    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4013    goto do_nref;
4014
4015   case NREFFU:
4016    folder = foldEQ_latin1;
4017    fold_array = PL_fold_latin1;
4018    type = REFFU;
4019    utf8_fold_flags = 0;
4020    goto do_nref;
4021
4022   case NREFF:
4023    folder = foldEQ;
4024    fold_array = PL_fold;
4025    type = REFF;
4026    utf8_fold_flags = 0;
4027    goto do_nref;
4028
4029   case NREF:
4030    type = REF;
4031    folder = NULL;
4032    fold_array = NULL;
4033    utf8_fold_flags = 0;
4034   do_nref:
4035
4036    /* For the named back references, find the corresponding buffer
4037    * number */
4038    n = reg_check_named_buff_matched(rex,scan);
4039
4040    if ( ! n ) {
4041     sayNO;
4042    }
4043    goto do_nref_ref_common;
4044
4045   case REFFL:
4046    PL_reg_flags |= RF_tainted;
4047    folder = foldEQ_locale;
4048    fold_array = PL_fold_locale;
4049    utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4050    goto do_ref;
4051
4052   case REFFA:
4053    folder = foldEQ_latin1;
4054    fold_array = PL_fold_latin1;
4055    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4056    goto do_ref;
4057
4058   case REFFU:
4059    folder = foldEQ_latin1;
4060    fold_array = PL_fold_latin1;
4061    utf8_fold_flags = 0;
4062    goto do_ref;
4063
4064   case REFF:
4065    folder = foldEQ;
4066    fold_array = PL_fold;
4067    utf8_fold_flags = 0;
4068    goto do_ref;
4069
4070   case REF:
4071    folder = NULL;
4072    fold_array = NULL;
4073    utf8_fold_flags = 0;
4074
4075   do_ref:
4076    type = OP(scan);
4077    n = ARG(scan);  /* which paren pair */
4078
4079   do_nref_ref_common:
4080    ln = PL_regoffs[n].start;
4081    PL_reg_leftiter = PL_reg_maxiter;  /* Void cache */
4082    if (*PL_reglastparen < n || ln == -1)
4083     sayNO;   /* Do not match unless seen CLOSEn. */
4084    if (ln == PL_regoffs[n].end)
4085     break;
4086
4087    s = PL_bostr + ln;
4088    if (type != REF /* REF can do byte comparison */
4089     && (utf8_target || type == REFFU))
4090    { /* XXX handle REFFL better */
4091     char * limit = PL_regeol;
4092
4093     /* This call case insensitively compares the entire buffer
4094      * at s, with the current input starting at locinput, but
4095      * not going off the end given by PL_regeol, and returns in
4096      * limit upon success, how much of the current input was
4097      * matched */
4098     if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4099          locinput, &limit, 0, utf8_target, utf8_fold_flags))
4100     {
4101      sayNO;
4102     }
4103     locinput = limit;
4104     nextchr = UCHARAT(locinput);
4105     break;
4106    }
4107
4108    /* Not utf8:  Inline the first character, for speed. */
4109    if (UCHARAT(s) != nextchr &&
4110     (type == REF ||
4111     UCHARAT(s) != fold_array[nextchr]))
4112     sayNO;
4113    ln = PL_regoffs[n].end - ln;
4114    if (locinput + ln > PL_regeol)
4115     sayNO;
4116    if (ln > 1 && (type == REF
4117       ? memNE(s, locinput, ln)
4118       : ! folder(s, locinput, ln)))
4119     sayNO;
4120    locinput += ln;
4121    nextchr = UCHARAT(locinput);
4122    break;
4123   }
4124   case NOTHING:
4125   case TAIL:
4126    break;
4127   case BACK:
4128    break;
4129
4130 #undef  ST
4131 #define ST st->u.eval
4132   {
4133    SV *ret;
4134    REGEXP *re_sv;
4135    regexp *re;
4136    regexp_internal *rei;
4137    regnode *startpoint;
4138
4139   case GOSTART:
4140   case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4141    if (cur_eval && cur_eval->locinput==locinput) {
4142     if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4143      Perl_croak(aTHX_ "Infinite recursion in regex");
4144     if ( ++nochange_depth > max_nochange_depth )
4145      Perl_croak(aTHX_
4146       "Pattern subroutine nesting without pos change"
4147       " exceeded limit in regex");
4148    } else {
4149     nochange_depth = 0;
4150    }
4151    re_sv = rex_sv;
4152    re = rex;
4153    rei = rexi;
4154    (void)ReREFCNT_inc(rex_sv);
4155    if (OP(scan)==GOSUB) {
4156     startpoint = scan + ARG2L(scan);
4157     ST.close_paren = ARG(scan);
4158    } else {
4159     startpoint = rei->program+1;
4160     ST.close_paren = 0;
4161    }
4162    goto eval_recurse_doit;
4163    /* NOTREACHED */
4164   case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4165    if (cur_eval && cur_eval->locinput==locinput) {
4166     if ( ++nochange_depth > max_nochange_depth )
4167      Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4168    } else {
4169     nochange_depth = 0;
4170    }
4171    {
4172     /* execute the code in the {...} */
4173     dSP;
4174     SV ** const before = SP;
4175     OP_4tree * const oop = PL_op;
4176     COP * const ocurcop = PL_curcop;
4177     PAD *old_comppad;
4178     char *saved_regeol = PL_regeol;
4179     struct re_save_state saved_state;
4180
4181     /* To not corrupt the existing regex state while executing the
4182     * eval we would normally put it on the save stack, like with
4183     * save_re_context. However, re-evals have a weird scoping so we
4184     * can't just add ENTER/LEAVE here. With that, things like
4185     *
4186     *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4187     *
4188     * would break, as they expect the localisation to be unwound
4189     * only when the re-engine backtracks through the bit that
4190     * localised it.
4191     *
4192     * What we do instead is just saving the state in a local c
4193     * variable.
4194     */
4195     Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4196
4197     n = ARG(scan);
4198     PL_op = (OP_4tree*)rexi->data->data[n];
4199     DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4200      "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4201     PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4202     PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4203
4204     if (sv_yes_mark) {
4205      SV *sv_mrk = get_sv("REGMARK", 1);
4206      sv_setsv(sv_mrk, sv_yes_mark);
4207     }
4208
4209     CALLRUNOPS(aTHX);   /* Scalar context. */
4210     SPAGAIN;
4211     if (SP == before)
4212      ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4213     else {
4214      ret = POPs;
4215      PUTBACK;
4216     }
4217
4218     Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4219
4220     PL_op = oop;
4221     PAD_RESTORE_LOCAL(old_comppad);
4222     PL_curcop = ocurcop;
4223     PL_regeol = saved_regeol;
4224     if (!logical) {
4225      /* /(?{...})/ */
4226      sv_setsv(save_scalar(PL_replgv), ret);
4227      break;
4228     }
4229    }
4230    if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4231     logical = 0;
4232     {
4233      /* extract RE object from returned value; compiling if
4234      * necessary */
4235      MAGIC *mg = NULL;
4236      REGEXP *rx = NULL;
4237
4238      if (SvROK(ret)) {
4239       SV *const sv = SvRV(ret);
4240
4241       if (SvTYPE(sv) == SVt_REGEXP) {
4242        rx = (REGEXP*) sv;
4243       } else if (SvSMAGICAL(sv)) {
4244        mg = mg_find(sv, PERL_MAGIC_qr);
4245        assert(mg);
4246       }
4247      } else if (SvTYPE(ret) == SVt_REGEXP) {
4248       rx = (REGEXP*) ret;
4249      } else if (SvSMAGICAL(ret)) {
4250       if (SvGMAGICAL(ret)) {
4251        /* I don't believe that there is ever qr magic
4252        here.  */
4253        assert(!mg_find(ret, PERL_MAGIC_qr));
4254        sv_unmagic(ret, PERL_MAGIC_qr);
4255       }
4256       else {
4257        mg = mg_find(ret, PERL_MAGIC_qr);
4258        /* testing suggests mg only ends up non-NULL for
4259        scalars who were upgraded and compiled in the
4260        else block below. In turn, this is only
4261        triggered in the "postponed utf8 string" tests
4262        in t/op/pat.t  */
4263       }
4264      }
4265
4266      if (mg) {
4267       rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4268       assert(rx);
4269      }
4270      if (rx) {
4271       rx = reg_temp_copy(NULL, rx);
4272      }
4273      else {
4274       U32 pm_flags = 0;
4275       const I32 osize = PL_regsize;
4276
4277       if (DO_UTF8(ret)) {
4278        assert (SvUTF8(ret));
4279       } else if (SvUTF8(ret)) {
4280        /* Not doing UTF-8, despite what the SV says. Is
4281        this only if we're trapped in use 'bytes'?  */
4282        /* Make a copy of the octet sequence, but without
4283        the flag on, as the compiler now honours the
4284        SvUTF8 flag on ret.  */
4285        STRLEN len;
4286        const char *const p = SvPV(ret, len);
4287        ret = newSVpvn_flags(p, len, SVs_TEMP);
4288       }
4289       rx = CALLREGCOMP(ret, pm_flags);
4290       if (!(SvFLAGS(ret)
4291        & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4292         | SVs_GMG))) {
4293        /* This isn't a first class regexp. Instead, it's
4294        caching a regexp onto an existing, Perl visible
4295        scalar.  */
4296        sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4297       }
4298       PL_regsize = osize;
4299      }
4300      re_sv = rx;
4301      re = (struct regexp *)SvANY(rx);
4302     }
4303     RXp_MATCH_COPIED_off(re);
4304     re->subbeg = rex->subbeg;
4305     re->sublen = rex->sublen;
4306     rei = RXi_GET(re);
4307     DEBUG_EXECUTE_r(
4308      debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4309       "Matching embedded");
4310     );
4311     startpoint = rei->program + 1;
4312      ST.close_paren = 0; /* only used for GOSUB */
4313      /* borrowed from regtry */
4314     if (PL_reg_start_tmpl <= re->nparens) {
4315      PL_reg_start_tmpl = re->nparens*3/2 + 3;
4316      if(PL_reg_start_tmp)
4317       Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4318      else
4319       Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4320     }
4321
4322   eval_recurse_doit: /* Share code with GOSUB below this line */
4323     /* run the pattern returned from (??{...}) */
4324     ST.cp = regcppush(0); /* Save *all* the positions. */
4325     REGCP_SET(ST.lastcp);
4326
4327     PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4328
4329     /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4330     PL_reglastparen = &re->lastparen;
4331     PL_reglastcloseparen = &re->lastcloseparen;
4332     re->lastparen = 0;
4333     re->lastcloseparen = 0;
4334
4335     PL_reginput = locinput;
4336     PL_regsize = 0;
4337
4338     /* XXXX This is too dramatic a measure... */
4339     PL_reg_maxiter = 0;
4340
4341     ST.toggle_reg_flags = PL_reg_flags;
4342     if (RX_UTF8(re_sv))
4343      PL_reg_flags |= RF_utf8;
4344     else
4345      PL_reg_flags &= ~RF_utf8;
4346     ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4347
4348     ST.prev_rex = rex_sv;
4349     ST.prev_curlyx = cur_curlyx;
4350     SETREX(rex_sv,re_sv);
4351     rex = re;
4352     rexi = rei;
4353     cur_curlyx = NULL;
4354     ST.B = next;
4355     ST.prev_eval = cur_eval;
4356     cur_eval = st;
4357     /* now continue from first node in postoned RE */
4358     PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4359     /* NOTREACHED */
4360    }
4361    /* logical is 1,   /(?(?{...})X|Y)/ */
4362    sw = cBOOL(SvTRUE(ret));
4363    logical = 0;
4364    break;
4365   }
4366
4367   case EVAL_AB: /* cleanup after a successful (??{A})B */
4368    /* note: this is called twice; first after popping B, then A */
4369    PL_reg_flags ^= ST.toggle_reg_flags;
4370    ReREFCNT_dec(rex_sv);
4371    SETREX(rex_sv,ST.prev_rex);
4372    rex = (struct regexp *)SvANY(rex_sv);
4373    rexi = RXi_GET(rex);
4374    regcpblow(ST.cp);
4375    cur_eval = ST.prev_eval;
4376    cur_curlyx = ST.prev_curlyx;
4377
4378    /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4379    PL_reglastparen = &rex->lastparen;
4380    PL_reglastcloseparen = &rex->lastcloseparen;
4381    /* also update PL_regoffs */
4382    PL_regoffs = rex->offs;
4383
4384    /* XXXX This is too dramatic a measure... */
4385    PL_reg_maxiter = 0;
4386    if ( nochange_depth )
4387     nochange_depth--;
4388    sayYES;
4389
4390
4391   case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4392    /* note: this is called twice; first after popping B, then A */
4393    PL_reg_flags ^= ST.toggle_reg_flags;
4394    ReREFCNT_dec(rex_sv);
4395    SETREX(rex_sv,ST.prev_rex);
4396    rex = (struct regexp *)SvANY(rex_sv);
4397    rexi = RXi_GET(rex);
4398    /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4399    PL_reglastparen = &rex->lastparen;
4400    PL_reglastcloseparen = &rex->lastcloseparen;
4401
4402    PL_reginput = locinput;
4403    REGCP_UNWIND(ST.lastcp);
4404    regcppop(rex);
4405    cur_eval = ST.prev_eval;
4406    cur_curlyx = ST.prev_curlyx;
4407    /* XXXX This is too dramatic a measure... */
4408    PL_reg_maxiter = 0;
4409    if ( nochange_depth )
4410     nochange_depth--;
4411    sayNO_SILENT;
4412 #undef ST
4413
4414   case OPEN:
4415    n = ARG(scan);  /* which paren pair */
4416    PL_reg_start_tmp[n] = locinput;
4417    if (n > PL_regsize)
4418     PL_regsize = n;
4419    lastopen = n;
4420    break;
4421   case CLOSE:
4422    n = ARG(scan);  /* which paren pair */
4423    PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4424    PL_regoffs[n].end = locinput - PL_bostr;
4425    /*if (n > PL_regsize)
4426     PL_regsize = n;*/
4427    if (n > *PL_reglastparen)
4428     *PL_reglastparen = n;
4429    *PL_reglastcloseparen = n;
4430    if (cur_eval && cur_eval->u.eval.close_paren == n) {
4431     goto fake_end;
4432    }
4433    break;
4434   case ACCEPT:
4435    if (ARG(scan)){
4436     regnode *cursor;
4437     for (cursor=scan;
4438      cursor && OP(cursor)!=END;
4439      cursor=regnext(cursor))
4440     {
4441      if ( OP(cursor)==CLOSE ){
4442       n = ARG(cursor);
4443       if ( n <= lastopen ) {
4444        PL_regoffs[n].start
4445         = PL_reg_start_tmp[n] - PL_bostr;
4446        PL_regoffs[n].end = locinput - PL_bostr;
4447        /*if (n > PL_regsize)
4448        PL_regsize = n;*/
4449        if (n > *PL_reglastparen)
4450         *PL_reglastparen = n;
4451        *PL_reglastcloseparen = n;
4452        if ( n == ARG(scan) || (cur_eval &&
4453         cur_eval->u.eval.close_paren == n))
4454         break;
4455       }
4456      }
4457     }
4458    }
4459    goto fake_end;
4460    /*NOTREACHED*/
4461   case GROUPP:
4462    n = ARG(scan);  /* which paren pair */
4463    sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4464    break;
4465   case NGROUPP:
4466    /* reg_check_named_buff_matched returns 0 for no match */
4467    sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4468    break;
4469   case INSUBP:
4470    n = ARG(scan);
4471    sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4472    break;
4473   case DEFINEP:
4474    sw = 0;
4475    break;
4476   case IFTHEN:
4477    PL_reg_leftiter = PL_reg_maxiter;  /* Void cache */
4478    if (sw)
4479     next = NEXTOPER(NEXTOPER(scan));
4480    else {
4481     next = scan + ARG(scan);
4482     if (OP(next) == IFTHEN) /* Fake one. */
4483      next = NEXTOPER(NEXTOPER(next));
4484    }
4485    break;
4486   case LOGICAL:
4487    logical = scan->flags;
4488    break;
4489
4490 /*******************************************************************
4491
4492 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4493 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4494 STAR/PLUS/CURLY/CURLYN are used instead.)
4495
4496 A*B is compiled as <CURLYX><A><WHILEM><B>
4497
4498 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4499 state, which contains the current count, initialised to -1. It also sets
4500 cur_curlyx to point to this state, with any previous value saved in the
4501 state block.
4502
4503 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4504 since the pattern may possibly match zero times (i.e. it's a while {} loop
4505 rather than a do {} while loop).
4506
4507 Each entry to WHILEM represents a successful match of A. The count in the
4508 CURLYX block is incremented, another WHILEM state is pushed, and execution
4509 passes to A or B depending on greediness and the current count.
4510
4511 For example, if matching against the string a1a2a3b (where the aN are
4512 substrings that match /A/), then the match progresses as follows: (the
4513 pushed states are interspersed with the bits of strings matched so far):
4514
4515  <CURLYX cnt=-1>
4516  <CURLYX cnt=0><WHILEM>
4517  <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4518  <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4519  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4520  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4521
4522 (Contrast this with something like CURLYM, which maintains only a single
4523 backtrack state:
4524
4525  <CURLYM cnt=0> a1
4526  a1 <CURLYM cnt=1> a2
4527  a1 a2 <CURLYM cnt=2> a3
4528  a1 a2 a3 <CURLYM cnt=3> b
4529 )
4530
4531 Each WHILEM state block marks a point to backtrack to upon partial failure
4532 of A or B, and also contains some minor state data related to that
4533 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4534 overall state, such as the count, and pointers to the A and B ops.
4535
4536 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4537 must always point to the *current* CURLYX block, the rules are:
4538
4539 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4540 and set cur_curlyx to point the new block.
4541
4542 When popping the CURLYX block after a successful or unsuccessful match,
4543 restore the previous cur_curlyx.
4544
4545 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4546 to the outer one saved in the CURLYX block.
4547
4548 When popping the WHILEM block after a successful or unsuccessful B match,
4549 restore the previous cur_curlyx.
4550
4551 Here's an example for the pattern (AI* BI)*BO
4552 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4553
4554 cur_
4555 curlyx backtrack stack
4556 ------ ---------------
4557 NULL
4558 CO     <CO prev=NULL> <WO>
4559 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4560 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4561 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4562
4563 At this point the pattern succeeds, and we work back down the stack to
4564 clean up, restoring as we go:
4565
4566 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4567 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4568 CO     <CO prev=NULL> <WO>
4569 NULL
4570
4571 *******************************************************************/
4572
4573 #define ST st->u.curlyx
4574
4575   case CURLYX:    /* start of /A*B/  (for complex A) */
4576   {
4577    /* No need to save/restore up to this paren */
4578    I32 parenfloor = scan->flags;
4579
4580    assert(next); /* keep Coverity happy */
4581    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4582     next += ARG(next);
4583
4584    /* XXXX Probably it is better to teach regpush to support
4585    parenfloor > PL_regsize... */
4586    if (parenfloor > (I32)*PL_reglastparen)
4587     parenfloor = *PL_reglastparen; /* Pessimization... */
4588
4589    ST.prev_curlyx= cur_curlyx;
4590    cur_curlyx = st;
4591    ST.cp = PL_savestack_ix;
4592
4593    /* these fields contain the state of the current curly.
4594    * they are accessed by subsequent WHILEMs */
4595    ST.parenfloor = parenfloor;
4596    ST.me = scan;
4597    ST.B = next;
4598    ST.minmod = minmod;
4599    minmod = 0;
4600    ST.count = -1; /* this will be updated by WHILEM */
4601    ST.lastloc = NULL;  /* this will be updated by WHILEM */
4602
4603    PL_reginput = locinput;
4604    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4605    /* NOTREACHED */
4606   }
4607
4608   case CURLYX_end: /* just finished matching all of A*B */
4609    cur_curlyx = ST.prev_curlyx;
4610    sayYES;
4611    /* NOTREACHED */
4612
4613   case CURLYX_end_fail: /* just failed to match all of A*B */
4614    regcpblow(ST.cp);
4615    cur_curlyx = ST.prev_curlyx;
4616    sayNO;
4617    /* NOTREACHED */
4618
4619
4620 #undef ST
4621 #define ST st->u.whilem
4622
4623   case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4624   {
4625    /* see the discussion above about CURLYX/WHILEM */
4626    I32 n;
4627    int min = ARG1(cur_curlyx->u.curlyx.me);
4628    int max = ARG2(cur_curlyx->u.curlyx.me);
4629    regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4630
4631    assert(cur_curlyx); /* keep Coverity happy */
4632    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4633    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4634    ST.cache_offset = 0;
4635    ST.cache_mask = 0;
4636
4637    PL_reginput = locinput;
4638
4639    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4640     "%*s  whilem: matched %ld out of %d..%d\n",
4641     REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4642    );
4643
4644    /* First just match a string of min A's. */
4645
4646    if (n < min) {
4647     ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4648     cur_curlyx->u.curlyx.lastloc = locinput;
4649     REGCP_SET(ST.lastcp);
4650
4651     PUSH_STATE_GOTO(WHILEM_A_pre, A);
4652     /* NOTREACHED */
4653    }
4654
4655    /* If degenerate A matches "", assume A done. */
4656
4657    if (locinput == cur_curlyx->u.curlyx.lastloc) {
4658     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4659     "%*s  whilem: empty match detected, trying continuation...\n",
4660     REPORT_CODE_OFF+depth*2, "")
4661     );
4662     goto do_whilem_B_max;
4663    }
4664
4665    /* super-linear cache processing */
4666
4667    if (scan->flags) {
4668
4669     if (!PL_reg_maxiter) {
4670      /* start the countdown: Postpone detection until we
4671      * know the match is not *that* much linear. */
4672      PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4673      /* possible overflow for long strings and many CURLYX's */
4674      if (PL_reg_maxiter < 0)
4675       PL_reg_maxiter = I32_MAX;
4676      PL_reg_leftiter = PL_reg_maxiter;
4677     }
4678
4679     if (PL_reg_leftiter-- == 0) {
4680      /* initialise cache */
4681      const I32 size = (PL_reg_maxiter + 7)/8;
4682      if (PL_reg_poscache) {
4683       if ((I32)PL_reg_poscache_size < size) {
4684        Renew(PL_reg_poscache, size, char);
4685        PL_reg_poscache_size = size;
4686       }
4687       Zero(PL_reg_poscache, size, char);
4688      }
4689      else {
4690       PL_reg_poscache_size = size;
4691       Newxz(PL_reg_poscache, size, char);
4692      }
4693      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4694  "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4695        PL_colors[4], PL_colors[5])
4696      );
4697     }
4698
4699     if (PL_reg_leftiter < 0) {
4700      /* have we already failed at this position? */
4701      I32 offset, mask;
4702      offset  = (scan->flags & 0xf) - 1
4703         + (locinput - PL_bostr)  * (scan->flags>>4);
4704      mask    = 1 << (offset % 8);
4705      offset /= 8;
4706      if (PL_reg_poscache[offset] & mask) {
4707       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4708        "%*s  whilem: (cache) already tried at this position...\n",
4709        REPORT_CODE_OFF+depth*2, "")
4710       );
4711       sayNO; /* cache records failure */
4712      }
4713      ST.cache_offset = offset;
4714      ST.cache_mask   = mask;
4715     }
4716    }
4717
4718    /* Prefer B over A for minimal matching. */
4719
4720    if (cur_curlyx->u.curlyx.minmod) {
4721     ST.save_curlyx = cur_curlyx;
4722     cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4723     ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4724     REGCP_SET(ST.lastcp);
4725     PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4726     /* NOTREACHED */
4727    }
4728
4729    /* Prefer A over B for maximal matching. */
4730
4731    if (n < max) { /* More greed allowed? */
4732     ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4733     cur_curlyx->u.curlyx.lastloc = locinput;
4734     REGCP_SET(ST.lastcp);
4735     PUSH_STATE_GOTO(WHILEM_A_max, A);
4736     /* NOTREACHED */
4737    }
4738    goto do_whilem_B_max;
4739   }
4740   /* NOTREACHED */
4741
4742   case WHILEM_B_min: /* just matched B in a minimal match */
4743   case WHILEM_B_max: /* just matched B in a maximal match */
4744    cur_curlyx = ST.save_curlyx;
4745    sayYES;
4746    /* NOTREACHED */
4747
4748   case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4749    cur_curlyx = ST.save_curlyx;
4750    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4751    cur_curlyx->u.curlyx.count--;
4752    CACHEsayNO;
4753    /* NOTREACHED */
4754
4755   case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4756    /* FALL THROUGH */
4757   case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4758    REGCP_UNWIND(ST.lastcp);
4759    regcppop(rex);
4760    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4761    cur_curlyx->u.curlyx.count--;
4762    CACHEsayNO;
4763    /* NOTREACHED */
4764
4765   case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4766    REGCP_UNWIND(ST.lastcp);
4767    regcppop(rex); /* Restore some previous $<digit>s? */
4768    PL_reginput = locinput;
4769    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4770     "%*s  whilem: failed, trying continuation...\n",
4771     REPORT_CODE_OFF+depth*2, "")
4772    );
4773   do_whilem_B_max:
4774    if (cur_curlyx->u.curlyx.count >= REG_INFTY
4775     && ckWARN(WARN_REGEXP)
4776     && !(PL_reg_flags & RF_warned))
4777    {
4778     PL_reg_flags |= RF_warned;
4779     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4780      "Complex regular subexpression recursion",
4781      REG_INFTY - 1);
4782    }
4783
4784    /* now try B */
4785    ST.save_curlyx = cur_curlyx;
4786    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4787    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4788    /* NOTREACHED */
4789
4790   case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4791    cur_curlyx = ST.save_curlyx;
4792    REGCP_UNWIND(ST.lastcp);
4793    regcppop(rex);
4794
4795    if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4796     /* Maximum greed exceeded */
4797     if (cur_curlyx->u.curlyx.count >= REG_INFTY
4798      && ckWARN(WARN_REGEXP)
4799      && !(PL_reg_flags & RF_warned))
4800     {
4801      PL_reg_flags |= RF_warned;
4802      Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4803       "%s limit (%d) exceeded",
4804       "Complex regular subexpression recursion",
4805       REG_INFTY - 1);
4806     }
4807     cur_curlyx->u.curlyx.count--;
4808     CACHEsayNO;
4809    }
4810
4811    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4812     "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4813    );
4814    /* Try grabbing another A and see if it helps. */
4815    PL_reginput = locinput;
4816    cur_curlyx->u.curlyx.lastloc = locinput;
4817    ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4818    REGCP_SET(ST.lastcp);
4819    PUSH_STATE_GOTO(WHILEM_A_min,
4820     /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4821    /* NOTREACHED */
4822
4823 #undef  ST
4824 #define ST st->u.branch
4825
4826   case BRANCHJ:     /*  /(...|A|...)/ with long next pointer */
4827    next = scan + ARG(scan);
4828    if (next == scan)
4829     next = NULL;
4830    scan = NEXTOPER(scan);
4831    /* FALL THROUGH */
4832
4833   case BRANCH:     /*  /(...|A|...)/ */
4834    scan = NEXTOPER(scan); /* scan now points to inner node */
4835    ST.lastparen = *PL_reglastparen;
4836    ST.next_branch = next;
4837    REGCP_SET(ST.cp);
4838    PL_reginput = locinput;
4839
4840    /* Now go into the branch */
4841    if (has_cutgroup) {
4842     PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4843    } else {
4844     PUSH_STATE_GOTO(BRANCH_next, scan);
4845    }
4846    /* NOTREACHED */
4847   case CUTGROUP:
4848    PL_reginput = locinput;
4849    sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4850     MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4851    PUSH_STATE_GOTO(CUTGROUP_next,next);
4852    /* NOTREACHED */
4853   case CUTGROUP_next_fail:
4854    do_cutgroup = 1;
4855    no_final = 1;
4856    if (st->u.mark.mark_name)
4857     sv_commit = st->u.mark.mark_name;
4858    sayNO;
4859    /* NOTREACHED */
4860   case BRANCH_next:
4861    sayYES;
4862    /* NOTREACHED */
4863   case BRANCH_next_fail: /* that branch failed; try the next, if any */
4864    if (do_cutgroup) {
4865     do_cutgroup = 0;
4866     no_final = 0;
4867    }
4868    REGCP_UNWIND(ST.cp);
4869    for (n = *PL_reglastparen; n > ST.lastparen; n--)
4870     PL_regoffs[n].end = -1;
4871    *PL_reglastparen = n;
4872    /*dmq: *PL_reglastcloseparen = n; */
4873    scan = ST.next_branch;
4874    /* no more branches? */
4875    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4876     DEBUG_EXECUTE_r({
4877      PerlIO_printf( Perl_debug_log,
4878       "%*s  %sBRANCH failed...%s\n",
4879       REPORT_CODE_OFF+depth*2, "",
4880       PL_colors[4],
4881       PL_colors[5] );
4882     });
4883     sayNO_SILENT;
4884    }
4885    continue; /* execute next BRANCH[J] op */
4886    /* NOTREACHED */
4887
4888   case MINMOD:
4889    minmod = 1;
4890    break;
4891
4892 #undef  ST
4893 #define ST st->u.curlym
4894
4895   case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4896
4897    /* This is an optimisation of CURLYX that enables us to push
4898    * only a single backtracking state, no matter how many matches
4899    * there are in {m,n}. It relies on the pattern being constant
4900    * length, with no parens to influence future backrefs
4901    */
4902
4903    ST.me = scan;
4904    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4905
4906    /* if paren positive, emulate an OPEN/CLOSE around A */
4907    if (ST.me->flags) {
4908     U32 paren = ST.me->flags;
4909     if (paren > PL_regsize)
4910      PL_regsize = paren;
4911     if (paren > *PL_reglastparen)
4912      *PL_reglastparen = paren;
4913     scan += NEXT_OFF(scan); /* Skip former OPEN. */
4914    }
4915    ST.A = scan;
4916    ST.B = next;
4917    ST.alen = 0;
4918    ST.count = 0;
4919    ST.minmod = minmod;
4920    minmod = 0;
4921    ST.c1 = CHRTEST_UNINIT;
4922    REGCP_SET(ST.cp);
4923
4924    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4925     goto curlym_do_B;
4926
4927   curlym_do_A: /* execute the A in /A{m,n}B/  */
4928    PL_reginput = locinput;
4929    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4930    /* NOTREACHED */
4931
4932   case CURLYM_A: /* we've just matched an A */
4933    locinput = st->locinput;
4934    nextchr = UCHARAT(locinput);
4935
4936    ST.count++;
4937    /* after first match, determine A's length: u.curlym.alen */
4938    if (ST.count == 1) {
4939     if (PL_reg_match_utf8) {
4940      char *s = locinput;
4941      while (s < PL_reginput) {
4942       ST.alen++;
4943       s += UTF8SKIP(s);
4944      }
4945     }
4946     else {
4947      ST.alen = PL_reginput - locinput;
4948     }
4949     if (ST.alen == 0)
4950      ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4951    }
4952    DEBUG_EXECUTE_r(
4953     PerlIO_printf(Perl_debug_log,
4954       "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4955       (int)(REPORT_CODE_OFF+(depth*2)), "",
4956       (IV) ST.count, (IV)ST.alen)
4957    );
4958
4959    locinput = PL_reginput;
4960
4961    if (cur_eval && cur_eval->u.eval.close_paren &&
4962     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4963     goto fake_end;
4964
4965    {
4966     I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4967     if ( max == REG_INFTY || ST.count < max )
4968      goto curlym_do_A; /* try to match another A */
4969    }
4970    goto curlym_do_B; /* try to match B */
4971
4972   case CURLYM_A_fail: /* just failed to match an A */
4973    REGCP_UNWIND(ST.cp);
4974
4975    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4976     || (cur_eval && cur_eval->u.eval.close_paren &&
4977      cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4978     sayNO;
4979
4980   curlym_do_B: /* execute the B in /A{m,n}B/  */
4981    PL_reginput = locinput;
4982    if (ST.c1 == CHRTEST_UNINIT) {
4983     /* calculate c1 and c2 for possible match of 1st char
4984     * following curly */
4985     ST.c1 = ST.c2 = CHRTEST_VOID;
4986     if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4987      regnode *text_node = ST.B;
4988      if (! HAS_TEXT(text_node))
4989       FIND_NEXT_IMPT(text_node);
4990      /* this used to be
4991
4992       (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4993
4994        But the former is redundant in light of the latter.
4995
4996        if this changes back then the macro for
4997        IS_TEXT and friends need to change.
4998      */
4999      if (PL_regkind[OP(text_node)] == EXACT)
5000      {
5001
5002       ST.c1 = (U8)*STRING(text_node);
5003       switch (OP(text_node)) {
5004        case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5005        case EXACTFA:
5006        case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5007        case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5008        default: ST.c2 = ST.c1;
5009       }
5010      }
5011     }
5012    }
5013
5014    DEBUG_EXECUTE_r(
5015     PerlIO_printf(Perl_debug_log,
5016      "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5017      (int)(REPORT_CODE_OFF+(depth*2)),
5018      "", (IV)ST.count)
5019     );
5020    if (ST.c1 != CHRTEST_VOID
5021      && UCHARAT(PL_reginput) != ST.c1
5022      && UCHARAT(PL_reginput) != ST.c2)
5023    {
5024     /* simulate B failing */
5025     DEBUG_OPTIMISE_r(
5026      PerlIO_printf(Perl_debug_log,
5027       "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5028       (int)(REPORT_CODE_OFF+(depth*2)),"",
5029       (IV)ST.c1,(IV)ST.c2
5030     ));
5031     state_num = CURLYM_B_fail;
5032     goto reenter_switch;
5033    }
5034
5035    if (ST.me->flags) {
5036     /* mark current A as captured */
5037     I32 paren = ST.me->flags;
5038     if (ST.count) {
5039      PL_regoffs[paren].start
5040       = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5041      PL_regoffs[paren].end = PL_reginput - PL_bostr;
5042      /*dmq: *PL_reglastcloseparen = paren; */
5043     }
5044     else
5045      PL_regoffs[paren].end = -1;
5046     if (cur_eval && cur_eval->u.eval.close_paren &&
5047      cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5048     {
5049      if (ST.count)
5050       goto fake_end;
5051      else
5052       sayNO;
5053     }
5054    }
5055
5056    PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5057    /* NOTREACHED */
5058
5059   case CURLYM_B_fail: /* just failed to match a B */
5060    REGCP_UNWIND(ST.cp);
5061    if (ST.minmod) {
5062     I32 max = ARG2(ST.me);
5063     if (max != REG_INFTY && ST.count == max)
5064      sayNO;
5065     goto curlym_do_A; /* try to match a further A */
5066    }
5067    /* backtrack one A */
5068    if (ST.count == ARG1(ST.me) /* min */)
5069     sayNO;
5070    ST.count--;
5071    locinput = HOPc(locinput, -ST.alen);
5072    goto curlym_do_B; /* try to match B */
5073
5074 #undef ST
5075 #define ST st->u.curly
5076
5077 #define CURLY_SETPAREN(paren, success) \
5078  if (paren) { \
5079   if (success) { \
5080    PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5081    PL_regoffs[paren].end = locinput - PL_bostr; \
5082    *PL_reglastcloseparen = paren; \
5083   } \
5084   else \
5085    PL_regoffs[paren].end = -1; \
5086  }
5087
5088   case STAR:  /*  /A*B/ where A is width 1 */
5089    ST.paren = 0;
5090    ST.min = 0;
5091    ST.max = REG_INFTY;
5092    scan = NEXTOPER(scan);
5093    goto repeat;
5094   case PLUS:  /*  /A+B/ where A is width 1 */
5095    ST.paren = 0;
5096    ST.min = 1;
5097    ST.max = REG_INFTY;
5098    scan = NEXTOPER(scan);
5099    goto repeat;
5100   case CURLYN:  /*  /(A){m,n}B/ where A is width 1 */
5101    ST.paren = scan->flags; /* Which paren to set */
5102    if (ST.paren > PL_regsize)
5103     PL_regsize = ST.paren;
5104    if (ST.paren > *PL_reglastparen)
5105     *PL_reglastparen = ST.paren;
5106    ST.min = ARG1(scan);  /* min to match */
5107    ST.max = ARG2(scan);  /* max to match */
5108    if (cur_eval && cur_eval->u.eval.close_paren &&
5109     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5110     ST.min=1;
5111     ST.max=1;
5112    }
5113    scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5114    goto repeat;
5115   case CURLY:  /*  /A{m,n}B/ where A is width 1 */
5116    ST.paren = 0;
5117    ST.min = ARG1(scan);  /* min to match */
5118    ST.max = ARG2(scan);  /* max to match */
5119    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5120   repeat:
5121    /*
5122    * Lookahead to avoid useless match attempts
5123    * when we know what character comes next.
5124    *
5125    * Used to only do .*x and .*?x, but now it allows
5126    * for )'s, ('s and (?{ ... })'s to be in the way
5127    * of the quantifier and the EXACT-like node.  -- japhy
5128    */
5129
5130    if (ST.min > ST.max) /* XXX make this a compile-time check? */
5131     sayNO;
5132    if (HAS_TEXT(next) || JUMPABLE(next)) {
5133     U8 *s;
5134     regnode *text_node = next;
5135
5136     if (! HAS_TEXT(text_node))
5137      FIND_NEXT_IMPT(text_node);
5138
5139     if (! HAS_TEXT(text_node))
5140      ST.c1 = ST.c2 = CHRTEST_VOID;
5141     else {
5142      if ( PL_regkind[OP(text_node)] != EXACT ) {
5143       ST.c1 = ST.c2 = CHRTEST_VOID;
5144       goto assume_ok_easy;
5145      }
5146      else
5147       s = (U8*)STRING(text_node);
5148
5149      /*  Currently we only get here when
5150
5151       PL_rekind[OP(text_node)] == EXACT
5152
5153       if this changes back then the macro for IS_TEXT and
5154       friends need to change. */
5155      if (!UTF_PATTERN) {
5156       ST.c1 = *s;
5157       switch (OP(text_node)) {
5158        case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5159        case EXACTFA:
5160        case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5161        case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5162        default: ST.c2 = ST.c1; break;
5163       }
5164      }
5165      else { /* UTF_PATTERN */
5166       if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5167        STRLEN ulen1, ulen2;
5168        U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5169        U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5170
5171        to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5172        to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5173 #ifdef EBCDIC
5174        ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5175              ckWARN(WARN_UTF8) ?
5176              0 : UTF8_ALLOW_ANY);
5177        ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5178              ckWARN(WARN_UTF8) ?
5179              0 : UTF8_ALLOW_ANY);
5180 #else
5181        ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5182              uniflags);
5183        ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5184              uniflags);
5185 #endif
5186       }
5187       else {
5188        ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5189              uniflags);
5190       }
5191      }
5192     }
5193    }
5194    else
5195     ST.c1 = ST.c2 = CHRTEST_VOID;
5196   assume_ok_easy:
5197
5198    ST.A = scan;
5199    ST.B = next;
5200    PL_reginput = locinput;
5201    if (minmod) {
5202     minmod = 0;
5203     if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5204      sayNO;
5205     ST.count = ST.min;
5206     locinput = PL_reginput;
5207     REGCP_SET(ST.cp);
5208     if (ST.c1 == CHRTEST_VOID)
5209      goto curly_try_B_min;
5210
5211     ST.oldloc = locinput;
5212
5213     /* set ST.maxpos to the furthest point along the
5214     * string that could possibly match */
5215     if  (ST.max == REG_INFTY) {
5216      ST.maxpos = PL_regeol - 1;
5217      if (utf8_target)
5218       while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5219        ST.maxpos--;
5220     }
5221     else if (utf8_target) {
5222      int m = ST.max - ST.min;
5223      for (ST.maxpos = locinput;
5224       m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5225       ST.maxpos += UTF8SKIP(ST.maxpos);
5226     }
5227     else {
5228      ST.maxpos = locinput + ST.max - ST.min;
5229      if (ST.maxpos >= PL_regeol)
5230       ST.maxpos = PL_regeol - 1;
5231     }
5232     goto curly_try_B_min_known;
5233
5234    }
5235    else {
5236     ST.count = regrepeat(rex, ST.A, ST.max, depth);
5237     locinput = PL_reginput;
5238     if (ST.count < ST.min)
5239      sayNO;
5240     if ((ST.count > ST.min)
5241      && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5242     {
5243      /* A{m,n} must come at the end of the string, there's
5244      * no point in backing off ... */
5245      ST.min = ST.count;
5246      /* ...except that $ and \Z can match before *and* after
5247      newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5248      We may back off by one in this case. */
5249      if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5250       ST.min--;
5251     }
5252     REGCP_SET(ST.cp);
5253     goto curly_try_B_max;
5254    }
5255    /* NOTREACHED */
5256
5257
5258   case CURLY_B_min_known_fail:
5259    /* failed to find B in a non-greedy match where c1,c2 valid */
5260    if (ST.paren && ST.count)
5261     PL_regoffs[ST.paren].end = -1;
5262
5263    PL_reginput = locinput; /* Could be reset... */
5264    REGCP_UNWIND(ST.cp);
5265    /* Couldn't or didn't -- move forward. */
5266    ST.oldloc = locinput;
5267    if (utf8_target)
5268     locinput += UTF8SKIP(locinput);
5269    else
5270     locinput++;
5271    ST.count++;
5272   curly_try_B_min_known:
5273    /* find the next place where 'B' could work, then call B */
5274    {
5275     int n;
5276     if (utf8_target) {
5277      n = (ST.oldloc == locinput) ? 0 : 1;
5278      if (ST.c1 == ST.c2) {
5279       STRLEN len;
5280       /* set n to utf8_distance(oldloc, locinput) */
5281       while (locinput <= ST.maxpos &&
5282        utf8n_to_uvchr((U8*)locinput,
5283            UTF8_MAXBYTES, &len,
5284            uniflags) != (UV)ST.c1) {
5285        locinput += len;
5286        n++;
5287       }
5288      }
5289      else {
5290       /* set n to utf8_distance(oldloc, locinput) */
5291       while (locinput <= ST.maxpos) {
5292        STRLEN len;
5293        const UV c = utf8n_to_uvchr((U8*)locinput,
5294             UTF8_MAXBYTES, &len,
5295             uniflags);
5296        if (c == (UV)ST.c1 || c == (UV)ST.c2)
5297         break;
5298        locinput += len;
5299        n++;
5300       }
5301      }
5302     }
5303     else {
5304      if (ST.c1 == ST.c2) {
5305       while (locinput <= ST.maxpos &&
5306        UCHARAT(locinput) != ST.c1)
5307        locinput++;
5308      }
5309      else {
5310       while (locinput <= ST.maxpos
5311        && UCHARAT(locinput) != ST.c1
5312        && UCHARAT(locinput) != ST.c2)
5313        locinput++;
5314      }
5315      n = locinput - ST.oldloc;
5316     }
5317     if (locinput > ST.maxpos)
5318      sayNO;
5319     /* PL_reginput == oldloc now */
5320     if (n) {
5321      ST.count += n;
5322      if (regrepeat(rex, ST.A, n, depth) < n)
5323       sayNO;
5324     }
5325     PL_reginput = locinput;
5326     CURLY_SETPAREN(ST.paren, ST.count);
5327     if (cur_eval && cur_eval->u.eval.close_paren &&
5328      cur_eval->u.eval.close_paren == (U32)ST.paren) {
5329      goto fake_end;
5330     }
5331     PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5332    }
5333    /* NOTREACHED */
5334
5335
5336   case CURLY_B_min_fail:
5337    /* failed to find B in a non-greedy match where c1,c2 invalid */
5338    if (ST.paren && ST.count)
5339     PL_regoffs[ST.paren].end = -1;
5340
5341    REGCP_UNWIND(ST.cp);
5342    /* failed -- move forward one */
5343    PL_reginput = locinput;
5344    if (regrepeat(rex, ST.A, 1, depth)) {
5345     ST.count++;
5346     locinput = PL_reginput;
5347     if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5348       ST.count > 0)) /* count overflow ? */
5349     {
5350     curly_try_B_min:
5351      CURLY_SETPAREN(ST.paren, ST.count);
5352      if (cur_eval && cur_eval->u.eval.close_paren &&
5353       cur_eval->u.eval.close_paren == (U32)ST.paren) {
5354       goto fake_end;
5355      }
5356      PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5357     }
5358    }
5359    sayNO;
5360    /* NOTREACHED */
5361
5362
5363   curly_try_B_max:
5364    /* a successful greedy match: now try to match B */
5365    if (cur_eval && cur_eval->u.eval.close_paren &&
5366     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5367     goto fake_end;
5368    }
5369    {
5370     UV c = 0;
5371     if (ST.c1 != CHRTEST_VOID)
5372      c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5373           UTF8_MAXBYTES, 0, uniflags)
5374         : (UV) UCHARAT(PL_reginput);
5375     /* If it could work, try it. */
5376     if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5377      CURLY_SETPAREN(ST.paren, ST.count);
5378      PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5379      /* NOTREACHED */
5380     }
5381    }
5382    /* FALL THROUGH */
5383   case CURLY_B_max_fail:
5384    /* failed to find B in a greedy match */
5385    if (ST.paren && ST.count)
5386     PL_regoffs[ST.paren].end = -1;
5387
5388    REGCP_UNWIND(ST.cp);
5389    /*  back up. */
5390    if (--ST.count < ST.min)
5391     sayNO;
5392    PL_reginput = locinput = HOPc(locinput, -1);
5393    goto curly_try_B_max;
5394
5395 #undef ST
5396
5397   case END:
5398    fake_end:
5399    if (cur_eval) {
5400     /* we've just finished A in /(??{A})B/; now continue with B */
5401     I32 tmpix;
5402     st->u.eval.toggle_reg_flags
5403        = cur_eval->u.eval.toggle_reg_flags;
5404     PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5405
5406     st->u.eval.prev_rex = rex_sv;  /* inner */
5407     SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5408     rex = (struct regexp *)SvANY(rex_sv);
5409     rexi = RXi_GET(rex);
5410     cur_curlyx = cur_eval->u.eval.prev_curlyx;
5411     (void)ReREFCNT_inc(rex_sv);
5412     st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
5413
5414     /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5415     PL_reglastparen = &rex->lastparen;
5416     PL_reglastcloseparen = &rex->lastcloseparen;
5417
5418     REGCP_SET(st->u.eval.lastcp);
5419     PL_reginput = locinput;
5420
5421     /* Restore parens of the outer rex without popping the
5422     * savestack */
5423     tmpix = PL_savestack_ix;
5424     PL_savestack_ix = cur_eval->u.eval.lastcp;
5425     regcppop(rex);
5426     PL_savestack_ix = tmpix;
5427
5428     st->u.eval.prev_eval = cur_eval;
5429     cur_eval = cur_eval->u.eval.prev_eval;
5430     DEBUG_EXECUTE_r(
5431      PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5432          REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5433     if ( nochange_depth )
5434      nochange_depth--;
5435
5436     PUSH_YES_STATE_GOTO(EVAL_AB,
5437       st->u.eval.prev_eval->u.eval.B); /* match B */
5438    }
5439
5440    if (locinput < reginfo->till) {
5441     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5442          "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5443          PL_colors[4],
5444          (long)(locinput - PL_reg_starttry),
5445          (long)(reginfo->till - PL_reg_starttry),
5446          PL_colors[5]));
5447
5448     sayNO_SILENT;  /* Cannot match: too short. */
5449    }
5450    PL_reginput = locinput; /* put where regtry can find it */
5451    sayYES;   /* Success! */
5452
5453   case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5454    DEBUG_EXECUTE_r(
5455    PerlIO_printf(Perl_debug_log,
5456     "%*s  %ssubpattern success...%s\n",
5457     REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5458    PL_reginput = locinput; /* put where regtry can find it */
5459    sayYES;   /* Success! */
5460
5461 #undef  ST
5462 #define ST st->u.ifmatch
5463
5464   case SUSPEND: /* (?>A) */
5465    ST.wanted = 1;
5466    PL_reginput = locinput;
5467    goto do_ifmatch;
5468
5469   case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5470    ST.wanted = 0;
5471    goto ifmatch_trivial_fail_test;
5472
5473   case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5474    ST.wanted = 1;
5475   ifmatch_trivial_fail_test:
5476    if (scan->flags) {
5477     char * const s = HOPBACKc(locinput, scan->flags);
5478     if (!s) {
5479      /* trivial fail */
5480      if (logical) {
5481       logical = 0;
5482       sw = 1 - cBOOL(ST.wanted);
5483      }
5484      else if (ST.wanted)
5485       sayNO;
5486      next = scan + ARG(scan);
5487      if (next == scan)
5488       next = NULL;
5489      break;
5490     }
5491     PL_reginput = s;
5492    }
5493    else
5494     PL_reginput = locinput;
5495
5496   do_ifmatch:
5497    ST.me = scan;
5498    ST.logical = logical;
5499    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5500
5501    /* execute body of (?...A) */
5502    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5503    /* NOTREACHED */
5504
5505   case IFMATCH_A_fail: /* body of (?...A) failed */
5506    ST.wanted = !ST.wanted;
5507    /* FALL THROUGH */
5508
5509   case IFMATCH_A: /* body of (?...A) succeeded */
5510    if (ST.logical) {
5511     sw = cBOOL(ST.wanted);
5512    }
5513    else if (!ST.wanted)
5514     sayNO;
5515
5516    if (OP(ST.me) == SUSPEND)
5517     locinput = PL_reginput;
5518    else {
5519     locinput = PL_reginput = st->locinput;
5520     nextchr = UCHARAT(locinput);
5521    }
5522    scan = ST.me + ARG(ST.me);
5523    if (scan == ST.me)
5524     scan = NULL;
5525    continue; /* execute B */
5526
5527 #undef ST
5528
5529   case LONGJMP:
5530    next = scan + ARG(scan);
5531    if (next == scan)
5532     next = NULL;
5533    break;
5534   case COMMIT:
5535    reginfo->cutpoint = PL_regeol;
5536    /* FALLTHROUGH */
5537   case PRUNE:
5538    PL_reginput = locinput;
5539    if (!scan->flags)
5540     sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5541    PUSH_STATE_GOTO(COMMIT_next,next);
5542    /* NOTREACHED */
5543   case COMMIT_next_fail:
5544    no_final = 1;
5545    /* FALLTHROUGH */
5546   case OPFAIL:
5547    sayNO;
5548    /* NOTREACHED */
5549
5550 #define ST st->u.mark
5551   case MARKPOINT:
5552    ST.prev_mark = mark_state;
5553    ST.mark_name = sv_commit = sv_yes_mark
5554     = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5555    mark_state = st;
5556    ST.mark_loc = PL_reginput = locinput;
5557    PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5558    /* NOTREACHED */
5559   case MARKPOINT_next:
5560    mark_state = ST.prev_mark;
5561    sayYES;
5562    /* NOTREACHED */
5563   case MARKPOINT_next_fail:
5564    if (popmark && sv_eq(ST.mark_name,popmark))
5565    {
5566     if (ST.mark_loc > startpoint)
5567      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5568     popmark = NULL; /* we found our mark */
5569     sv_commit = ST.mark_name;
5570
5571     DEBUG_EXECUTE_r({
5572       PerlIO_printf(Perl_debug_log,
5573        "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5574        REPORT_CODE_OFF+depth*2, "",
5575        PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5576     });
5577    }
5578    mark_state = ST.prev_mark;
5579    sv_yes_mark = mark_state ?
5580     mark_state->u.mark.mark_name : NULL;
5581    sayNO;
5582    /* NOTREACHED */
5583   case SKIP:
5584    PL_reginput = locinput;
5585    if (scan->flags) {
5586     /* (*SKIP) : if we fail we cut here*/
5587     ST.mark_name = NULL;
5588     ST.mark_loc = locinput;
5589     PUSH_STATE_GOTO(SKIP_next,next);
5590    } else {
5591     /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5592     otherwise do nothing.  Meaning we need to scan
5593     */
5594     regmatch_state *cur = mark_state;
5595     SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5596
5597     while (cur) {
5598      if ( sv_eq( cur->u.mark.mark_name,
5599         find ) )
5600      {
5601       ST.mark_name = find;
5602       PUSH_STATE_GOTO( SKIP_next, next );
5603      }
5604      cur = cur->u.mark.prev_mark;
5605     }
5606    }
5607    /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5608    break;
5609   case SKIP_next_fail:
5610    if (ST.mark_name) {
5611     /* (*CUT:NAME) - Set up to search for the name as we
5612     collapse the stack*/
5613     popmark = ST.mark_name;
5614    } else {
5615     /* (*CUT) - No name, we cut here.*/
5616     if (ST.mark_loc > startpoint)
5617      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5618     /* but we set sv_commit to latest mark_name if there
5619     is one so they can test to see how things lead to this
5620     cut */
5621     if (mark_state)
5622      sv_commit=mark_state->u.mark.mark_name;
5623    }
5624    no_final = 1;
5625    sayNO;
5626    /* NOTREACHED */
5627 #undef ST
5628   case FOLDCHAR:
5629    n = ARG(scan);
5630    if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5631     locinput += ln;
5632    } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5633     sayNO;
5634    } else  {
5635     U8 folded[UTF8_MAXBYTES_CASE+1];
5636     STRLEN foldlen;
5637     const char * const l = locinput;
5638     char *e = PL_regeol;
5639     to_uni_fold(n, folded, &foldlen);
5640
5641     if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5642        l, &e, 0,  utf8_target)) {
5643       sayNO;
5644     }
5645     locinput = e;
5646    }
5647    nextchr = UCHARAT(locinput);
5648    break;
5649   case LNBREAK:
5650    if ((n=is_LNBREAK(locinput,utf8_target))) {
5651     locinput += n;
5652     nextchr = UCHARAT(locinput);
5653    } else
5654     sayNO;
5655    break;
5656
5657 #define CASE_CLASS(nAmE)                              \
5658   case nAmE:                                    \
5659    if ((n=is_##nAmE(locinput,utf8_target))) {    \
5660     locinput += n;                        \
5661     nextchr = UCHARAT(locinput);          \
5662    } else                                    \
5663     sayNO;                                \
5664    break;                                    \
5665   case N##nAmE:                                 \
5666    if ((n=is_##nAmE(locinput,utf8_target))) {    \
5667     sayNO;                                \
5668    } else {                                  \
5669     locinput += UTF8SKIP(locinput);       \
5670     nextchr = UCHARAT(locinput);          \
5671    }                                         \
5672    break
5673
5674   CASE_CLASS(VERTWS);
5675   CASE_CLASS(HORIZWS);
5676 #undef CASE_CLASS
5677
5678   default:
5679    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5680       PTR2UV(scan), OP(scan));
5681    Perl_croak(aTHX_ "regexp memory corruption");
5682
5683   } /* end switch */
5684
5685   /* switch break jumps here */
5686   scan = next; /* prepare to execute the next op and ... */
5687   continue;    /* ... jump back to the top, reusing st */
5688   /* NOTREACHED */
5689
5690  push_yes_state:
5691   /* push a state that backtracks on success */
5692   st->u.yes.prev_yes_state = yes_state;
5693   yes_state = st;
5694   /* FALL THROUGH */
5695  push_state:
5696   /* push a new regex state, then continue at scan  */
5697   {
5698    regmatch_state *newst;
5699
5700    DEBUG_STACK_r({
5701     regmatch_state *cur = st;
5702     regmatch_state *curyes = yes_state;
5703     int curd = depth;
5704     regmatch_slab *slab = PL_regmatch_slab;
5705     for (;curd > -1;cur--,curd--) {
5706      if (cur < SLAB_FIRST(slab)) {
5707       slab = slab->prev;
5708       cur = SLAB_LAST(slab);
5709      }
5710      PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5711       REPORT_CODE_OFF + 2 + depth * 2,"",
5712       curd, PL_reg_name[cur->resume_state],
5713       (curyes == cur) ? "yes" : ""
5714      );
5715      if (curyes == cur)
5716       curyes = cur->u.yes.prev_yes_state;
5717     }
5718    } else
5719     DEBUG_STATE_pp("push")
5720    );
5721    depth++;
5722    st->locinput = locinput;
5723    newst = st+1;
5724    if (newst >  SLAB_LAST(PL_regmatch_slab))
5725     newst = S_push_slab(aTHX);
5726    PL_regmatch_state = newst;
5727
5728    locinput = PL_reginput;
5729    nextchr = UCHARAT(locinput);
5730    st = newst;
5731    continue;
5732    /* NOTREACHED */
5733   }
5734  }
5735
5736  /*
5737  * We get here only if there's trouble -- normally "case END" is
5738  * the terminating point.
5739  */
5740  Perl_croak(aTHX_ "corrupted regexp pointers");
5741  /*NOTREACHED*/
5742  sayNO;
5743
5744 yes:
5745  if (yes_state) {
5746   /* we have successfully completed a subexpression, but we must now
5747   * pop to the state marked by yes_state and continue from there */
5748   assert(st != yes_state);
5749 #ifdef DEBUGGING
5750   while (st != yes_state) {
5751    st--;
5752    if (st < SLAB_FIRST(PL_regmatch_slab)) {
5753     PL_regmatch_slab = PL_regmatch_slab->prev;
5754     st = SLAB_LAST(PL_regmatch_slab);
5755    }
5756    DEBUG_STATE_r({
5757     if (no_final) {
5758      DEBUG_STATE_pp("pop (no final)");
5759     } else {
5760      DEBUG_STATE_pp("pop (yes)");
5761     }
5762    });
5763    depth--;
5764   }
5765 #else
5766   while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5767    || yes_state > SLAB_LAST(PL_regmatch_slab))
5768   {
5769    /* not in this slab, pop slab */
5770    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5771    PL_regmatch_slab = PL_regmatch_slab->prev;
5772    st = SLAB_LAST(PL_regmatch_slab);
5773   }
5774   depth -= (st - yes_state);
5775 #endif
5776   st = yes_state;
5777   yes_state = st->u.yes.prev_yes_state;
5778   PL_regmatch_state = st;
5779
5780   if (no_final) {
5781    locinput= st->locinput;
5782    nextchr = UCHARAT(locinput);
5783   }
5784   state_num = st->resume_state + no_final;
5785   goto reenter_switch;
5786  }
5787
5788  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5789       PL_colors[4], PL_colors[5]));
5790
5791  if (PL_reg_eval_set) {
5792   /* each successfully executed (?{...}) block does the equivalent of
5793   *   local $^R = do {...}
5794   * When popping the save stack, all these locals would be undone;
5795   * bypass this by setting the outermost saved $^R to the latest
5796   * value */
5797   if (oreplsv != GvSV(PL_replgv))
5798    sv_setsv(oreplsv, GvSV(PL_replgv));
5799  }
5800  result = 1;
5801  goto final_exit;
5802
5803 no:
5804  DEBUG_EXECUTE_r(
5805   PerlIO_printf(Perl_debug_log,
5806    "%*s  %sfailed...%s\n",
5807    REPORT_CODE_OFF+depth*2, "",
5808    PL_colors[4], PL_colors[5])
5809   );
5810
5811 no_silent:
5812  if (no_final) {
5813   if (yes_state) {
5814    goto yes;
5815   } else {
5816    goto final_exit;
5817   }
5818  }
5819  if (depth) {
5820   /* there's a previous state to backtrack to */
5821   st--;
5822   if (st < SLAB_FIRST(PL_regmatch_slab)) {
5823    PL_regmatch_slab = PL_regmatch_slab->prev;
5824    st = SLAB_LAST(PL_regmatch_slab);
5825   }
5826   PL_regmatch_state = st;
5827   locinput= st->locinput;
5828   nextchr = UCHARAT(locinput);
5829
5830   DEBUG_STATE_pp("pop");
5831   depth--;
5832   if (yes_state == st)
5833    yes_state = st->u.yes.prev_yes_state;
5834
5835   state_num = st->resume_state + 1; /* failure = success + 1 */
5836   goto reenter_switch;
5837  }
5838  result = 0;
5839
5840   final_exit:
5841  if (rex->intflags & PREGf_VERBARG_SEEN) {
5842   SV *sv_err = get_sv("REGERROR", 1);
5843   SV *sv_mrk = get_sv("REGMARK", 1);
5844   if (result) {
5845    sv_commit = &PL_sv_no;
5846    if (!sv_yes_mark)
5847     sv_yes_mark = &PL_sv_yes;
5848   } else {
5849    if (!sv_commit)
5850     sv_commit = &PL_sv_yes;
5851    sv_yes_mark = &PL_sv_no;
5852   }
5853   sv_setsv(sv_err, sv_commit);
5854   sv_setsv(sv_mrk, sv_yes_mark);
5855  }
5856
5857  /* clean up; in particular, free all slabs above current one */
5858  LEAVE_SCOPE(oldsave);
5859
5860  return result;
5861 }
5862
5863 /*
5864  - regrepeat - repeatedly match something simple, report how many
5865  */
5866 /*
5867  * [This routine now assumes that it will only match on things of length 1.
5868  * That was true before, but now we assume scan - reginput is the count,
5869  * rather than incrementing count on every character.  [Er, except utf8.]]
5870  */
5871 STATIC I32
5872 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5873 {
5874  dVAR;
5875  register char *scan;
5876  register I32 c;
5877  register char *loceol = PL_regeol;
5878  register I32 hardcount = 0;
5879  register bool utf8_target = PL_reg_match_utf8;
5880  UV utf8_flags;
5881 #ifndef DEBUGGING
5882  PERL_UNUSED_ARG(depth);
5883 #endif
5884
5885  PERL_ARGS_ASSERT_REGREPEAT;
5886
5887  scan = PL_reginput;
5888  if (max == REG_INFTY)
5889   max = I32_MAX;
5890  else if (max < loceol - scan)
5891   loceol = scan + max;
5892  switch (OP(p)) {
5893  case REG_ANY:
5894   if (utf8_target) {
5895    loceol = PL_regeol;
5896    while (scan < loceol && hardcount < max && *scan != '\n') {
5897     scan += UTF8SKIP(scan);
5898     hardcount++;
5899    }
5900   } else {
5901    while (scan < loceol && *scan != '\n')
5902     scan++;
5903   }
5904   break;
5905  case SANY:
5906   if (utf8_target) {
5907    loceol = PL_regeol;
5908    while (scan < loceol && hardcount < max) {
5909     scan += UTF8SKIP(scan);
5910     hardcount++;
5911    }
5912   }
5913   else
5914    scan = loceol;
5915   break;
5916  case CANY:
5917   scan = loceol;
5918   break;
5919  case EXACT:
5920   /* To get here, EXACTish nodes must have *byte* length == 1.  That
5921   * means they match only characters in the string that can be expressed
5922   * as a single byte.  For non-utf8 strings, that means a simple match.
5923   * For utf8 strings, the character matched must be an invariant, or
5924   * downgradable to a single byte.  The pattern's utf8ness is
5925   * irrelevant, as since it's a single byte, it either isn't utf8, or if
5926   * it is, it's an invariant */
5927
5928   c = (U8)*STRING(p);
5929   assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5930
5931   if (! utf8_target || UNI_IS_INVARIANT(c)) {
5932    while (scan < loceol && UCHARAT(scan) == c) {
5933     scan++;
5934    }
5935   }
5936   else {
5937
5938    /* Here, the string is utf8, and the pattern char is different
5939    * in utf8 than not, so can't compare them directly.  Outside the
5940    * loop, find find the two utf8 bytes that represent c, and then
5941    * look for those in sequence in the utf8 string */
5942    U8 high = UTF8_TWO_BYTE_HI(c);
5943    U8 low = UTF8_TWO_BYTE_LO(c);
5944    loceol = PL_regeol;
5945
5946    while (hardcount < max
5947      && scan + 1 < loceol
5948      && UCHARAT(scan) == high
5949      && UCHARAT(scan + 1) == low)
5950    {
5951     scan += 2;
5952     hardcount++;
5953    }
5954   }
5955   break;
5956  case EXACTFA:
5957   utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5958   goto do_exactf;
5959
5960  case EXACTFL:
5961   PL_reg_flags |= RF_tainted;
5962   utf8_flags = FOLDEQ_UTF8_LOCALE;
5963   goto do_exactf;
5964
5965  case EXACTF:
5966  case EXACTFU:
5967   utf8_flags = 0;
5968
5969   /* The comments for the EXACT case above apply as well to these fold
5970   * ones */
5971
5972  do_exactf:
5973   c = (U8)*STRING(p);
5974   assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5975
5976   if (utf8_target) { /* Use full Unicode fold matching */
5977    char *tmpeol = loceol;
5978    while (hardcount < max
5979      && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
5980         STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
5981    {
5982     scan = tmpeol;
5983     tmpeol = loceol;
5984     hardcount++;
5985    }
5986
5987    /* XXX Note that the above handles properly the German sharp s in
5988    * the pattern matching ss in the string.  But it doesn't handle
5989    * properly cases where the string contains say 'LIGATURE ff' and
5990    * the pattern is 'f+'.  This would require, say, a new function or
5991    * revised interface to foldEQ_utf8(), in which the maximum number
5992    * of characters to match could be passed and it would return how
5993    * many actually did.  This is just one of many cases where
5994    * multi-char folds don't work properly, and so the fix is being
5995    * deferred */
5996   }
5997   else {
5998    U8 folded;
5999
6000    /* Here, the string isn't utf8 and c is a single byte; and either
6001    * the pattern isn't utf8 or c is an invariant, so its utf8ness
6002    * doesn't affect c.  Can just do simple comparisons for exact or
6003    * fold matching. */
6004    switch (OP(p)) {
6005     case EXACTF: folded = PL_fold[c]; break;
6006     case EXACTFA:
6007     case EXACTFU: folded = PL_fold_latin1[c]; break;
6008     case EXACTFL: folded = PL_fold_locale[c]; break;
6009     default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6010    }
6011    while (scan < loceol &&
6012     (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6013    {
6014     scan++;
6015    }
6016   }
6017   break;
6018  case ANYOFV:
6019  case ANYOF:
6020   if (utf8_target || OP(p) == ANYOFV) {
6021    STRLEN inclasslen;
6022    loceol = PL_regeol;
6023    inclasslen = loceol - scan;
6024    while (hardcount < max
6025     && ((inclasslen = loceol - scan) > 0)
6026     && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6027    {
6028     scan += inclasslen;
6029     hardcount++;
6030    }
6031   } else {
6032    while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6033     scan++;
6034   }
6035   break;
6036  case ALNUMU:
6037   if (utf8_target) {
6038  utf8_wordchar:
6039    loceol = PL_regeol;
6040    LOAD_UTF8_CHARCLASS_ALNUM();
6041    while (hardcount < max && scan < loceol &&
6042     swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6043    {
6044     scan += UTF8SKIP(scan);
6045     hardcount++;
6046    }
6047   } else {
6048    while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6049     scan++;
6050    }
6051   }
6052   break;
6053  case ALNUM:
6054   if (utf8_target)
6055    goto utf8_wordchar;
6056   while (scan < loceol && isALNUM((U8) *scan)) {
6057    scan++;
6058   }
6059   break;
6060  case ALNUMA:
6061   while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6062    scan++;
6063   }
6064   break;
6065  case ALNUML:
6066   PL_reg_flags |= RF_tainted;
6067   if (utf8_target) {
6068    loceol = PL_regeol;
6069    while (hardcount < max && scan < loceol &&
6070     isALNUM_LC_utf8((U8*)scan)) {
6071     scan += UTF8SKIP(scan);
6072     hardcount++;
6073    }
6074   } else {
6075    while (scan < loceol && isALNUM_LC(*scan))
6076     scan++;
6077   }
6078   break;
6079  case NALNUMU:
6080   if (utf8_target) {
6081
6082  utf8_Nwordchar:
6083
6084    loceol = PL_regeol;
6085    LOAD_UTF8_CHARCLASS_ALNUM();
6086    while (hardcount < max && scan < loceol &&
6087     ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6088    {
6089     scan += UTF8SKIP(scan);
6090     hardcount++;
6091    }
6092   } else {
6093    while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6094     scan++;
6095    }
6096   }
6097   break;
6098  case NALNUM:
6099   if (utf8_target)
6100    goto utf8_Nwordchar;
6101   while (scan < loceol && ! isALNUM((U8) *scan)) {
6102    scan++;
6103   }
6104   break;
6105  case NALNUMA:
6106   if (utf8_target) {
6107    while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6108     scan += UTF8SKIP(scan);
6109    }
6110   }
6111   else {
6112    while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6113     scan++;
6114    }
6115   }
6116   break;
6117  case NALNUML:
6118   PL_reg_flags |= RF_tainted;
6119   if (utf8_target) {
6120    loceol = PL_regeol;
6121    while (hardcount < max && scan < loceol &&
6122     !isALNUM_LC_utf8((U8*)scan)) {
6123     scan += UTF8SKIP(scan);
6124     hardcount++;
6125    }
6126   } else {
6127    while (scan < loceol && !isALNUM_LC(*scan))
6128     scan++;
6129   }
6130   break;
6131  case SPACEU:
6132   if (utf8_target) {
6133
6134  utf8_space:
6135
6136    loceol = PL_regeol;
6137    LOAD_UTF8_CHARCLASS_SPACE();
6138    while (hardcount < max && scan < loceol &&
6139     (*scan == ' ' ||
6140      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6141    {
6142     scan += UTF8SKIP(scan);
6143     hardcount++;
6144    }
6145    break;
6146   }
6147   else {
6148    while (scan < loceol && isSPACE_L1((U8) *scan)) {
6149     scan++;
6150    }
6151    break;
6152   }
6153  case SPACE:
6154   if (utf8_target)
6155    goto utf8_space;
6156
6157   while (scan < loceol && isSPACE((U8) *scan)) {
6158    scan++;
6159   }
6160   break;
6161  case SPACEA:
6162   while (scan < loceol && isSPACE_A((U8) *scan)) {
6163    scan++;
6164   }
6165   break;
6166  case SPACEL:
6167   PL_reg_flags |= RF_tainted;
6168   if (utf8_target) {
6169    loceol = PL_regeol;
6170    while (hardcount < max && scan < loceol &&
6171     isSPACE_LC_utf8((U8*)scan)) {
6172     scan += UTF8SKIP(scan);
6173     hardcount++;
6174    }
6175   } else {
6176    while (scan < loceol && isSPACE_LC(*scan))
6177     scan++;
6178   }
6179   break;
6180  case NSPACEU:
6181   if (utf8_target) {
6182
6183  utf8_Nspace:
6184
6185    loceol = PL_regeol;
6186    LOAD_UTF8_CHARCLASS_SPACE();
6187    while (hardcount < max && scan < loceol &&
6188     ! (*scan == ' ' ||
6189      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6190    {
6191     scan += UTF8SKIP(scan);
6192     hardcount++;
6193    }
6194    break;
6195   }
6196   else {
6197    while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6198     scan++;
6199    }
6200   }
6201   break;
6202  case NSPACE:
6203   if (utf8_target)
6204    goto utf8_Nspace;
6205
6206   while (scan < loceol && ! isSPACE((U8) *scan)) {
6207    scan++;
6208   }
6209   break;
6210  case NSPACEA:
6211   if (utf8_target) {
6212    while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6213     scan += UTF8SKIP(scan);
6214    }
6215   }
6216   else {
6217    while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6218     scan++;
6219    }
6220   }
6221   break;
6222  case NSPACEL:
6223   PL_reg_flags |= RF_tainted;
6224   if (utf8_target) {
6225    loceol = PL_regeol;
6226    while (hardcount < max && scan < loceol &&
6227     !isSPACE_LC_utf8((U8*)scan)) {
6228     scan += UTF8SKIP(scan);
6229     hardcount++;
6230    }
6231   } else {
6232    while (scan < loceol && !isSPACE_LC(*scan))
6233     scan++;
6234   }
6235   break;
6236  case DIGIT:
6237   if (utf8_target) {
6238    loceol = PL_regeol;
6239    LOAD_UTF8_CHARCLASS_DIGIT();
6240    while (hardcount < max && scan < loceol &&
6241     swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6242     scan += UTF8SKIP(scan);
6243     hardcount++;
6244    }
6245   } else {
6246    while (scan < loceol && isDIGIT(*scan))
6247     scan++;
6248   }
6249   break;
6250  case DIGITA:
6251   while (scan < loceol && isDIGIT_A((U8) *scan)) {
6252    scan++;
6253   }
6254   break;
6255  case DIGITL:
6256   PL_reg_flags |= RF_tainted;
6257   if (utf8_target) {
6258    loceol = PL_regeol;
6259    while (hardcount < max && scan < loceol &&
6260     isDIGIT_LC_utf8((U8*)scan)) {
6261     scan += UTF8SKIP(scan);
6262     hardcount++;
6263    }
6264   } else {
6265    while (scan < loceol && isDIGIT_LC(*scan))
6266     scan++;
6267   }
6268   break;
6269  case NDIGIT:
6270   if (utf8_target) {
6271    loceol = PL_regeol;
6272    LOAD_UTF8_CHARCLASS_DIGIT();
6273    while (hardcount < max && scan < loceol &&
6274     !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6275     scan += UTF8SKIP(scan);
6276     hardcount++;
6277    }
6278   } else {
6279    while (scan < loceol && !isDIGIT(*scan))
6280     scan++;
6281   }
6282   break;
6283  case NDIGITA:
6284   if (utf8_target) {
6285    while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6286     scan += UTF8SKIP(scan);
6287    }
6288   }
6289   else {
6290    while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6291     scan++;
6292    }
6293   }
6294   break;
6295  case NDIGITL:
6296   PL_reg_flags |= RF_tainted;
6297   if (utf8_target) {
6298    loceol = PL_regeol;
6299    while (hardcount < max && scan < loceol &&
6300     !isDIGIT_LC_utf8((U8*)scan)) {
6301     scan += UTF8SKIP(scan);
6302     hardcount++;
6303    }
6304   } else {
6305    while (scan < loceol && !isDIGIT_LC(*scan))
6306     scan++;
6307   }
6308   break;
6309  case LNBREAK:
6310   if (utf8_target) {
6311    loceol = PL_regeol;
6312    while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6313     scan += c;
6314     hardcount++;
6315    }
6316   } else {
6317    /*
6318    LNBREAK can match two latin chars, which is ok,
6319    because we have a null terminated string, but we
6320    have to use hardcount in this situation
6321    */
6322    while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6323     scan+=c;
6324     hardcount++;
6325    }
6326   }
6327   break;
6328  case HORIZWS:
6329   if (utf8_target) {
6330    loceol = PL_regeol;
6331    while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6332     scan += c;
6333     hardcount++;
6334    }
6335   } else {
6336    while (scan < loceol && is_HORIZWS_latin1(scan))
6337     scan++;
6338   }
6339   break;
6340  case NHORIZWS:
6341   if (utf8_target) {
6342    loceol = PL_regeol;
6343    while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6344     scan += UTF8SKIP(scan);
6345     hardcount++;
6346    }
6347   } else {
6348    while (scan < loceol && !is_HORIZWS_latin1(scan))
6349     scan++;
6350
6351   }
6352   break;
6353  case VERTWS:
6354   if (utf8_target) {
6355    loceol = PL_regeol;
6356    while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6357     scan += c;
6358     hardcount++;
6359    }
6360   } else {
6361    while (scan < loceol && is_VERTWS_latin1(scan))
6362     scan++;
6363
6364   }
6365   break;
6366  case NVERTWS:
6367   if (utf8_target) {
6368    loceol = PL_regeol;
6369    while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6370     scan += UTF8SKIP(scan);
6371     hardcount++;
6372    }
6373   } else {
6374    while (scan < loceol && !is_VERTWS_latin1(scan))
6375     scan++;
6376
6377   }
6378   break;
6379
6380  default:  /* Called on something of 0 width. */
6381   break;  /* So match right here or not at all. */
6382  }
6383
6384  if (hardcount)
6385   c = hardcount;
6386  else
6387   c = scan - PL_reginput;
6388  PL_reginput = scan;
6389
6390  DEBUG_r({
6391   GET_RE_DEBUG_FLAGS_DECL;
6392   DEBUG_EXECUTE_r({
6393    SV * const prop = sv_newmortal();
6394    regprop(prog, prop, p);
6395    PerlIO_printf(Perl_debug_log,
6396       "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6397       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6398   });
6399  });
6400
6401  return(c);
6402 }
6403
6404
6405 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6406 /*
6407 - regclass_swash - prepare the utf8 swash
6408 */
6409
6410 SV *
6411 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6412 {
6413  dVAR;
6414  SV *sw  = NULL;
6415  SV *si  = NULL;
6416  SV *alt = NULL;
6417  RXi_GET_DECL(prog,progi);
6418  const struct reg_data * const data = prog ? progi->data : NULL;
6419
6420  PERL_ARGS_ASSERT_REGCLASS_SWASH;
6421
6422  assert(ANYOF_NONBITMAP(node));
6423
6424  if (data && data->count) {
6425   const U32 n = ARG(node);
6426
6427   if (data->what[n] == 's') {
6428    SV * const rv = MUTABLE_SV(data->data[n]);
6429    AV * const av = MUTABLE_AV(SvRV(rv));
6430    SV **const ary = AvARRAY(av);
6431    SV **a, **b;
6432
6433    /* See the end of regcomp.c:S_regclass() for
6434    * documentation of these array elements. */
6435
6436    si = *ary;
6437    a  = SvROK(ary[1]) ? &ary[1] : NULL;
6438    b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6439
6440    if (a)
6441     sw = *a;
6442    else if (si && doinit) {
6443     sw = swash_init("utf8", "", si, 1, 0);
6444     (void)av_store(av, 1, sw);
6445    }
6446    if (b)
6447     alt = *b;
6448   }
6449  }
6450
6451  if (listsvp)
6452   *listsvp = si;
6453  if (altsvp)
6454   *altsvp  = alt;
6455
6456  return sw;
6457 }
6458 #endif
6459
6460 /*
6461  - reginclass - determine if a character falls into a character class
6462
6463   n is the ANYOF regnode
6464   p is the target string
6465   lenp is pointer to the maximum number of bytes of how far to go in p
6466  (This is assumed wthout checking to always be at least the current
6467  character's size)
6468   utf8_target tells whether p is in UTF-8.
6469
6470   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6471   from a successful match, the value it points to will be updated to how many
6472   bytes in p were matched.  If there was no match, the value is undefined,
6473   possibly changed from the input.
6474
6475   Note that this can be a synthetic start class, a combination of various
6476   nodes, so things you think might be mutually exclusive, such as locale,
6477   aren't.  It can match both locale and non-locale
6478
6479  */
6480
6481 STATIC bool
6482 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6483 {
6484  dVAR;
6485  const char flags = ANYOF_FLAGS(n);
6486  bool match = FALSE;
6487  UV c = *p;
6488  STRLEN c_len = 0;
6489  STRLEN maxlen;
6490
6491  PERL_ARGS_ASSERT_REGINCLASS;
6492
6493  /* If c is not already the code point, get it */
6494  if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6495   c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6496     (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6497     | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6498     /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6499     * UTF8_ALLOW_FFFF */
6500   if (c_len == (STRLEN)-1)
6501    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6502  }
6503  else {
6504   c_len = 1;
6505  }
6506
6507  /* Use passed in max length, or one character if none passed in or less
6508  * than one character.  And assume will match just one character.  This is
6509  * overwritten later if matched more. */
6510  if (lenp) {
6511   maxlen = (*lenp > c_len) ? *lenp : c_len;
6512   *lenp = c_len;
6513
6514  }
6515  else {
6516   maxlen = c_len;
6517  }
6518
6519  /* If this character is potentially in the bitmap, check it */
6520  if (c < 256) {
6521   if (ANYOF_BITMAP_TEST(n, c))
6522    match = TRUE;
6523   else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6524     && ! utf8_target
6525     && ! isASCII(c))
6526   {
6527    match = TRUE;
6528   }
6529
6530   else if (flags & ANYOF_LOCALE) {
6531    PL_reg_flags |= RF_tainted;
6532
6533    if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6534     && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6535    {
6536     match = TRUE;
6537    }
6538    else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6539      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6540      (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6541      (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6542      (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6543      (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6544      (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6545      (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6546      (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6547      (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6548      (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6549      (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6550      (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6551      (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6552      (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6553      (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6554      (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6555      (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6556      (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6557      (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6558      (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6559      (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6560      (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6561      (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6562      (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6563      (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6564      (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6565      (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6566      (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6567      (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6568      (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6569      ) /* How's that for a conditional? */
6570    ) {
6571     match = TRUE;
6572    }
6573   }
6574  }
6575
6576  /* If the bitmap didn't (or couldn't) match, and something outside the
6577  * bitmap could match, try that.  Locale nodes specifiy completely the
6578  * behavior of code points in the bit map (otherwise, a utf8 target would
6579  * cause them to be treated as Unicode and not locale), except in
6580  * the very unlikely event when this node is a synthetic start class, which
6581  * could be a combination of locale and non-locale nodes.  So allow locale
6582  * to match for the synthetic start class, which will give a false
6583  * positive that will be resolved when the match is done again as not part
6584  * of the synthetic start class */
6585  if (!match) {
6586   if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6587    match = TRUE; /* Everything above 255 matches */
6588   }
6589   else if (ANYOF_NONBITMAP(n)
6590     && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6591      || (utf8_target
6592       && (c >=256
6593        || (! (flags & ANYOF_LOCALE))
6594        || (flags & ANYOF_IS_SYNTHETIC)))))
6595   {
6596    AV *av;
6597    SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6598
6599    if (sw) {
6600     U8 * utf8_p;
6601     if (utf8_target) {
6602      utf8_p = (U8 *) p;
6603     } else {
6604
6605      /* Not utf8.  Convert as much of the string as available up
6606      * to the limit of how far the (single) character in the
6607      * pattern can possibly match (no need to go further).  If
6608      * the node is a straight ANYOF or not folding, it can't
6609      * match more than one.  Otherwise, It can match up to how
6610      * far a single char can fold to.  Since not utf8, each
6611      * character is a single byte, so the max it can be in
6612      * bytes is the same as the max it can be in characters */
6613      STRLEN len = (OP(n) == ANYOF
6614         || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6615         ? 1
6616         : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6617          ? maxlen
6618          : UTF8_MAX_FOLD_CHAR_EXPAND;
6619      utf8_p = bytes_to_utf8(p, &len);
6620     }
6621
6622     if (swash_fetch(sw, utf8_p, TRUE))
6623      match = TRUE;
6624     else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6625
6626      /* Here, we need to test if the fold of the target string
6627      * matches.  The non-multi char folds have all been moved to
6628      * the compilation phase, and the multi-char folds have
6629      * been stored by regcomp into 'av'; we linearly check to
6630      * see if any match the target string (folded).   We know
6631      * that the originals were each one character, but we don't
6632      * currently know how many characters/bytes each folded to,
6633      * except we do know that there are small limits imposed by
6634      * Unicode.  XXX A performance enhancement would be to have
6635      * regcomp.c store the max number of chars/bytes that are
6636      * in an av entry, as, say the 0th element.  Even better
6637      * would be to have a hash of the few characters that can
6638      * start a multi-char fold to the max number of chars of
6639      * those folds.
6640      *
6641      * If there is a match, we will need to advance (if lenp is
6642      * specified) the match pointer in the target string.  But
6643      * what we are comparing here isn't that string directly,
6644      * but its fold, whose length may differ from the original.
6645      * As we go along in constructing the fold, therefore, we
6646      * create a map so that we know how many bytes in the
6647      * source to advance given that we have matched a certain
6648      * number of bytes in the fold.  This map is stored in
6649      * 'map_fold_len_back'.  Let n mean the number of bytes in
6650      * the fold of the first character that we are folding.
6651      * Then map_fold_len_back[n] is set to the number of bytes
6652      * in that first character.  Similarly let m be the
6653      * corresponding number for the second character to be
6654      * folded.  Then map_fold_len_back[n+m] is set to the
6655      * number of bytes occupied by the first two source
6656      * characters. ... */
6657      U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6658      U8 folded[UTF8_MAXBYTES_CASE+1];
6659      STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6660      STRLEN total_foldlen = 0; /* num bytes in fold of all
6661             chars */
6662
6663      if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6664
6665       /* Here, only need to fold the first char of the target
6666       * string.  It the source wasn't utf8, is 1 byte long */
6667       to_utf8_fold(utf8_p, folded, &foldlen);
6668       total_foldlen = foldlen;
6669       map_fold_len_back[foldlen] = (utf8_target)
6670              ? UTF8SKIP(utf8_p)
6671              : 1;
6672      }
6673      else {
6674
6675       /* Here, need to fold more than the first char.  Do so
6676       * up to the limits */
6677       U8* source_ptr = utf8_p;    /* The source for the fold
6678              is the regex target
6679              string */
6680       U8* folded_ptr = folded;
6681       U8* e = utf8_p + maxlen;    /* Can't go beyond last
6682              available byte in the
6683              target string */
6684       U8 i;
6685       for (i = 0;
6686        i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6687        i++)
6688       {
6689
6690        /* Fold the next character */
6691        U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6692        STRLEN this_char_foldlen;
6693        to_utf8_fold(source_ptr,
6694           this_char_folded,
6695           &this_char_foldlen);
6696
6697        /* Bail if it would exceed the byte limit for
6698        * folding a single char. */
6699        if (this_char_foldlen + folded_ptr - folded >
6700                UTF8_MAXBYTES_CASE)
6701        {
6702         break;
6703        }
6704
6705        /* Add the fold of this character */
6706        Copy(this_char_folded,
6707         folded_ptr,
6708         this_char_foldlen,
6709         U8);
6710        source_ptr += UTF8SKIP(source_ptr);
6711        folded_ptr += this_char_foldlen;
6712        total_foldlen = folded_ptr - folded;
6713
6714        /* Create map from the number of bytes in the fold
6715        * back to the number of bytes in the source.  If
6716        * the source isn't utf8, the byte count is just
6717        * the number of characters so far */
6718        map_fold_len_back[total_foldlen]
6719              = (utf8_target)
6720               ? source_ptr - utf8_p
6721               : i + 1;
6722       }
6723       *folded_ptr = '\0';
6724      }
6725
6726
6727      /* Do the linear search to see if the fold is in the list
6728      * of multi-char folds. */
6729      if (av) {
6730       I32 i;
6731       for (i = 0; i <= av_len(av); i++) {
6732        SV* const sv = *av_fetch(av, i, FALSE);
6733        STRLEN len;
6734        const char * const s = SvPV_const(sv, len);
6735
6736        if (len <= total_foldlen
6737         && memEQ(s, (char*)folded, len)
6738
6739         /* If 0, means matched a partial char. See
6740          * [perl #90536] */
6741         && map_fold_len_back[len])
6742        {
6743
6744         /* Advance the target string ptr to account for
6745         * this fold, but have to translate from the
6746         * folded length to the corresponding source
6747         * length. */
6748         if (lenp) {
6749          *lenp = map_fold_len_back[len];
6750         }
6751         match = TRUE;
6752         break;
6753        }
6754       }
6755      }
6756     }
6757
6758     /* If we allocated a string above, free it */
6759     if (! utf8_target) Safefree(utf8_p);
6760    }
6761   }
6762  }
6763
6764  return (flags & ANYOF_INVERT) ? !match : match;
6765 }
6766
6767 STATIC U8 *
6768 S_reghop3(U8 *s, I32 off, const U8* lim)
6769 {
6770  dVAR;
6771
6772  PERL_ARGS_ASSERT_REGHOP3;
6773
6774  if (off >= 0) {
6775   while (off-- && s < lim) {
6776    /* XXX could check well-formedness here */
6777    s += UTF8SKIP(s);
6778   }
6779  }
6780  else {
6781   while (off++ && s > lim) {
6782    s--;
6783    if (UTF8_IS_CONTINUED(*s)) {
6784     while (s > lim && UTF8_IS_CONTINUATION(*s))
6785      s--;
6786    }
6787    /* XXX could check well-formedness here */
6788   }
6789  }
6790  return s;
6791 }
6792
6793 #ifdef XXX_dmq
6794 /* there are a bunch of places where we use two reghop3's that should
6795    be replaced with this routine. but since thats not done yet
6796    we ifdef it out - dmq
6797 */
6798 STATIC U8 *
6799 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6800 {
6801  dVAR;
6802
6803  PERL_ARGS_ASSERT_REGHOP4;
6804
6805  if (off >= 0) {
6806   while (off-- && s < rlim) {
6807    /* XXX could check well-formedness here */
6808    s += UTF8SKIP(s);
6809   }
6810  }
6811  else {
6812   while (off++ && s > llim) {
6813    s--;
6814    if (UTF8_IS_CONTINUED(*s)) {
6815     while (s > llim && UTF8_IS_CONTINUATION(*s))
6816      s--;
6817    }
6818    /* XXX could check well-formedness here */
6819   }
6820  }
6821  return s;
6822 }
6823 #endif
6824
6825 STATIC U8 *
6826 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6827 {
6828  dVAR;
6829
6830  PERL_ARGS_ASSERT_REGHOPMAYBE3;
6831
6832  if (off >= 0) {
6833   while (off-- && s < lim) {
6834    /* XXX could check well-formedness here */
6835    s += UTF8SKIP(s);
6836   }
6837   if (off >= 0)
6838    return NULL;
6839  }
6840  else {
6841   while (off++ && s > lim) {
6842    s--;
6843    if (UTF8_IS_CONTINUED(*s)) {
6844     while (s > lim && UTF8_IS_CONTINUATION(*s))
6845      s--;
6846    }
6847    /* XXX could check well-formedness here */
6848   }
6849   if (off <= 0)
6850    return NULL;
6851  }
6852  return s;
6853 }
6854
6855 static void
6856 restore_pos(pTHX_ void *arg)
6857 {
6858  dVAR;
6859  regexp * const rex = (regexp *)arg;
6860  if (PL_reg_eval_set) {
6861   if (PL_reg_oldsaved) {
6862    rex->subbeg = PL_reg_oldsaved;
6863    rex->sublen = PL_reg_oldsavedlen;
6864 #ifdef PERL_OLD_COPY_ON_WRITE
6865    rex->saved_copy = PL_nrs;
6866 #endif
6867    RXp_MATCH_COPIED_on(rex);
6868   }
6869   PL_reg_magic->mg_len = PL_reg_oldpos;
6870   PL_reg_eval_set = 0;
6871   PL_curpm = PL_reg_oldcurpm;
6872  }
6873 }
6874
6875 STATIC void
6876 S_to_utf8_substr(pTHX_ register regexp *prog)
6877 {
6878  int i = 1;
6879
6880  PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6881
6882  do {
6883   if (prog->substrs->data[i].substr
6884    && !prog->substrs->data[i].utf8_substr) {
6885    SV* const sv = newSVsv(prog->substrs->data[i].substr);
6886    prog->substrs->data[i].utf8_substr = sv;
6887    sv_utf8_upgrade(sv);
6888    if (SvVALID(prog->substrs->data[i].substr)) {
6889     const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6890     if (flags & FBMcf_TAIL) {
6891      /* Trim the trailing \n that fbm_compile added last
6892      time.  */
6893      SvCUR_set(sv, SvCUR(sv) - 1);
6894      /* Whilst this makes the SV technically "invalid" (as its
6895      buffer is no longer followed by "\0") when fbm_compile()
6896      adds the "\n" back, a "\0" is restored.  */
6897     }
6898     fbm_compile(sv, flags);
6899    }
6900    if (prog->substrs->data[i].substr == prog->check_substr)
6901     prog->check_utf8 = sv;
6902   }
6903  } while (i--);
6904 }
6905
6906 STATIC void
6907 S_to_byte_substr(pTHX_ register regexp *prog)
6908 {
6909  dVAR;
6910  int i = 1;
6911
6912  PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6913
6914  do {
6915   if (prog->substrs->data[i].utf8_substr
6916    && !prog->substrs->data[i].substr) {
6917    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6918    if (sv_utf8_downgrade(sv, TRUE)) {
6919     if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6920      const U8 flags
6921       = BmFLAGS(prog->substrs->data[i].utf8_substr);
6922      if (flags & FBMcf_TAIL) {
6923       /* Trim the trailing \n that fbm_compile added last
6924       time.  */
6925       SvCUR_set(sv, SvCUR(sv) - 1);
6926      }
6927      fbm_compile(sv, flags);
6928     }
6929    } else {
6930     SvREFCNT_dec(sv);
6931     sv = &PL_sv_undef;
6932    }
6933    prog->substrs->data[i].substr = sv;
6934    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6935     prog->check_substr = sv;
6936   }
6937  } while (i--);
6938 }
6939
6940 /*
6941  * Local variables:
6942  * c-indentation-style: bsd
6943  * c-basic-offset: 4
6944  * indent-tabs-mode: t
6945  * End:
6946  *
6947  * ex: set ts=8 sts=4 sw=4 noet:
6948  */