src/5014000/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *  One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  * Copyright (c) 1986 by University of Toronto.
  44  * Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  * Permission is granted to anyone to use this software for any
  47  * purpose on any computer system, and to redistribute it freely,
  48  * subject to the following restrictions:
  49  *
  50  * 1. The author is not responsible for the consequences of use of
  51  *  this software, no matter how awful, even if they arise
  52  *  from defects in it.
  53  *
  54  * 2. The origin of this software must not be misrepresented, either
  55  *  by explicit claim or by omission.
  56  *
  57  * 3. Altered versions must be plainly marked as such, and must not
  58  *  be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #define RF_tainted 1 /* tainted information used? e.g. locale */
  85 #define RF_warned 2  /* warned about big count? */
  86
  87 #define RF_utf8  8  /* Pattern contains multibyte chars? */
  88
  89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
  90
  91 #define RS_init  1  /* eval environment created */
  92 #define RS_set  2  /* replsv value is set */
  93
  94 #ifndef STATIC
  95 #define STATIC static
  96 #endif
  97
  98 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
  99  * call if there are no complications: i.e., if everything matchable is
 100  * straight forward in the bitmap */
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0)   \
 102            : ANYOF_BITMAP_TEST(p,*(c)))
 103
 104 /*
 105  * Forwards.
 106  */
 107
 108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 110
 111 #define HOPc(pos,off) \
 112   (char *)(PL_reg_match_utf8 \
 113    ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 114    : (U8*)(pos + off))
 115 #define HOPBACKc(pos, off) \
 116   (char*)(PL_reg_match_utf8\
 117    ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 118    : (pos - off >= PL_bostr)  \
 119     ? (U8*)pos - off  \
 120     : NULL)
 121
 122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 124
 125 /* these are unrolled below in the CCC_TRY_XXX defined */
 126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 127  if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 128
 129 /* Doesn't do an assert to verify that is correct */
 130 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 131  if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 132
 133 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 134 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 135 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 136
 137 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 138   LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 139   LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 140   /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 141    * assert should likely and hopefully fail on an EBCDIC machine */ \
 142   LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 143                    \
 144   /* No asserts are done for these, in case called on an early        \
 145    * Unicode version in which they map to nothing */               \
 146   LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 147   LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);     /* U+1100 "\xe1\x84\x80" */ \
 148   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 149   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 150   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 151   LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 152   LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 153
 154 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
 155
 156 /* The actual code for CCC_TRY, which uses several variables from the routine
 157  * it's callable from.  It is designed to be the bulk of a case statement.
 158  * FUNC is the macro or function to call on non-utf8 targets that indicate if
 159  *      nextchr matches the class.
 160  * UTF8_TEST is the whole test string to use for utf8 targets
 161  * LOAD is what to use to test, and if not present to load in the swash for the
 162  * class
 163  * POS_OR_NEG is either empty or ! to complement the results of FUNC or
 164  * UTF8_TEST test.
 165  * The logic is: Fail if we're at the end-of-string; otherwise if the target is
 166  * utf8 and a variant, load the swash if necessary and test using the utf8
 167  * test.  Advance to the next character if test is ok, otherwise fail; If not
 168  * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
 169  * fails, or advance to the next character */
 170
 171 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR)                \
 172  if (locinput >= PL_regeol) {                                              \
 173   sayNO;                                                                \
 174  }                                                                         \
 175  if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {                          \
 176   LOAD_UTF8_CHARCLASS(CLASS, STR);                                      \
 177   if (POS_OR_NEG (UTF8_TEST)) {                                         \
 178    sayNO;                                                            \
 179   }                                                                     \
 180   locinput += PL_utf8skip[nextchr];                                     \
 181   nextchr = UCHARAT(locinput);                                          \
 182   break;                                                                \
 183  }                                                                         \
 184  if (POS_OR_NEG (FUNC(nextchr))) {                                         \
 185   sayNO;                                                                \
 186  }                                                                         \
 187  nextchr = UCHARAT(++locinput);                                            \
 188  break;
 189
 190 /* Handle the non-locale cases for a character class and its complement.  It
 191  * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
 192  * This is because that code fails when the test succeeds, so we want to have
 193  * the test fail so that the code succeeds.  The swash is stored in a
 194  * predictable PL_ place */
 195 #define _CCC_TRY_NONLOCALE(NAME,  NNAME,  FUNC,                               \
 196       CLASS, STR)                                        \
 197  case NAME:                                                                \
 198   _CCC_TRY_CODE( !, FUNC,                                               \
 199       cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 200            (U8*)locinput, TRUE)),            \
 201       CLASS, STR)                                         \
 202  case NNAME:                                                               \
 203   _CCC_TRY_CODE(  PLACEHOLDER , FUNC,                                   \
 204       cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS),             \
 205            (U8*)locinput, TRUE)),            \
 206       CLASS, STR)                                         \
 207
 208 /* Generate the case statements for both locale and non-locale character
 209  * classes in regmatch for classes that don't have special unicode semantics.
 210  * Locales don't use an immediate swash, but an intermediary special locale
 211  * function that is called on the pointer to the current place in the input
 212  * string.  That function will resolve to needing the same swash.  One might
 213  * think that because we don't know what the locale will match, we shouldn't
 214  * check with the swash loading function that it loaded properly; ie, that we
 215  * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
 216  * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
 217  * irrelevant here */
 218 #define CCC_TRY(NAME,  NNAME,  FUNC,                                          \
 219     NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                           \
 220     NAMEA, NNAMEA, FUNCA,                                         \
 221     CLASS, STR)                                                   \
 222  case NAMEL:                                                               \
 223   PL_reg_flags |= RF_tainted;                                           \
 224   _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR)     \
 225  case NNAMEL:                                                              \
 226   PL_reg_flags |= RF_tainted;                                           \
 227   _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput),       \
 228      CLASS, STR)                                            \
 229  case NAMEA:                                                               \
 230   if (locinput >= PL_regeol || ! FUNCA(nextchr)) {                      \
 231    sayNO;                                                            \
 232   }                                                                     \
 233   /* Matched a utf8-invariant, so don't have to worry about utf8 */     \
 234   nextchr = UCHARAT(++locinput);                                        \
 235   break;                                                                \
 236  case NNAMEA:                                                              \
 237   if (locinput >= PL_regeol || FUNCA(nextchr)) {                        \
 238    sayNO;                                                            \
 239   }                                                                     \
 240   if (utf8_target) {                                                    \
 241    locinput += PL_utf8skip[nextchr];                                 \
 242    nextchr = UCHARAT(locinput);                                      \
 243   }                                                                     \
 244   else {                                                                \
 245    nextchr = UCHARAT(++locinput);                                    \
 246   }                                                                     \
 247   break;                                                                \
 248  /* Generate the non-locale cases */                                       \
 249  _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
 250
 251 /* This is like CCC_TRY, but has an extra set of parameters for generating case
 252  * statements to handle separate Unicode semantics nodes */
 253 #define CCC_TRY_U(NAME,  NNAME,  FUNC,                                         \
 254     NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                          \
 255     NAMEU, NNAMEU, FUNCU,                                        \
 256     NAMEA, NNAMEA, FUNCA,                                        \
 257     CLASS, STR)                                                  \
 258  CCC_TRY(NAME, NNAME, FUNC,                                                 \
 259    NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8,                                \
 260    NAMEA, NNAMEA, FUNCA,                                              \
 261    CLASS, STR)                                                        \
 262  _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
 263
 264 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 265
 266 /* for use after a quantifier and before an EXACT-like node -- japhy */
 267 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 268  *
 269  * NOTE that *nothing* that affects backtracking should be in here, specifically
 270  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 271  * node that is in between two EXACT like nodes when ascertaining what the required
 272  * "follow" character is. This should probably be moved to regex compile time
 273  * although it may be done at run time beause of the REF possibility - more
 274  * investigation required. -- demerphq
 275 */
 276 #define JUMPABLE(rn) (      \
 277  OP(rn) == OPEN ||       \
 278  (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 279  OP(rn) == EVAL ||   \
 280  OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 281  OP(rn) == PLUS || OP(rn) == MINMOD || \
 282  OP(rn) == KEEPS || \
 283  (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 284 )
 285 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 286
 287 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 288
 289 #if 0
 290 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 291    we don't need this definition. */
 292 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 293 #define IS_TEXTF(rn)  ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA ||  OP(rn)==EXACTF)  || OP(rn)==REFF  || OP(rn)==NREFF )
 294 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 295
 296 #else
 297 /* ... so we use this as its faster. */
 298 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 299 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn) == EXACTFA)
 300 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 301 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 302
 303 #endif
 304
 305 /*
 306   Search for mandatory following text node; for lookahead, the text must
 307   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 308 */
 309 #define FIND_NEXT_IMPT(rn) STMT_START { \
 310  while (JUMPABLE(rn)) { \
 311   const OPCODE type = OP(rn); \
 312   if (type == SUSPEND || PL_regkind[type] == CURLY) \
 313    rn = NEXTOPER(NEXTOPER(rn)); \
 314   else if (type == PLUS) \
 315    rn = NEXTOPER(rn); \
 316   else if (type == IFMATCH) \
 317    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 318   else rn += NEXT_OFF(rn); \
 319  } \
 320 } STMT_END
 321
 322
 323 static void restore_pos(pTHX_ void *arg);
 324
 325 #define REGCP_PAREN_ELEMS 4
 326 #define REGCP_OTHER_ELEMS 5
 327 #define REGCP_FRAME_ELEMS 1
 328 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 329  * are needed for the regexp context stack bookkeeping. */
 330
 331 STATIC CHECKPOINT
 332 S_regcppush(pTHX_ I32 parenfloor)
 333 {
 334  dVAR;
 335  const int retval = PL_savestack_ix;
 336  const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 337  const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 338  const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 339  int p;
 340  GET_RE_DEBUG_FLAGS_DECL;
 341
 342  if (paren_elems_to_push < 0)
 343   Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 344
 345  if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 346   Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 347     " out of range (%lu-%ld)",
 348     total_elems, (unsigned long)PL_regsize, (long)parenfloor);
 349
 350  SSGROW(total_elems + REGCP_FRAME_ELEMS);
 351
 352  for (p = PL_regsize; p > parenfloor; p--) {
 353 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 354   SSPUSHINT(PL_regoffs[p].end);
 355   SSPUSHINT(PL_regoffs[p].start);
 356   SSPUSHPTR(PL_reg_start_tmp[p]);
 357   SSPUSHINT(p);
 358   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 359   "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 360      (UV)p, (IV)PL_regoffs[p].start,
 361      (IV)(PL_reg_start_tmp[p] - PL_bostr),
 362      (IV)PL_regoffs[p].end
 363   ));
 364  }
 365 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 366  SSPUSHPTR(PL_regoffs);
 367  SSPUSHINT(PL_regsize);
 368  SSPUSHINT(*PL_reglastparen);
 369  SSPUSHINT(*PL_reglastcloseparen);
 370  SSPUSHPTR(PL_reginput);
 371  SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 372
 373  return retval;
 374 }
 375
 376 /* These are needed since we do not localize EVAL nodes: */
 377 #define REGCP_SET(cp)                                           \
 378  DEBUG_STATE_r(                                              \
 379    PerlIO_printf(Perl_debug_log,          \
 380     "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 381     (IV)PL_savestack_ix));                          \
 382  cp = PL_savestack_ix
 383
 384 #define REGCP_UNWIND(cp)                                        \
 385  DEBUG_STATE_r(                                              \
 386   if (cp != PL_savestack_ix)                   \
 387     PerlIO_printf(Perl_debug_log,          \
 388     "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 389     (IV)(cp), (IV)PL_savestack_ix));                \
 390  regcpblow(cp)
 391
 392 STATIC char *
 393 S_regcppop(pTHX_ const regexp *rex)
 394 {
 395  dVAR;
 396  UV i;
 397  char *input;
 398  GET_RE_DEBUG_FLAGS_DECL;
 399
 400  PERL_ARGS_ASSERT_REGCPPOP;
 401
 402  /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 403  i = SSPOPUV;
 404  assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 405  i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 406  input = (char *) SSPOPPTR;
 407  *PL_reglastcloseparen = SSPOPINT;
 408  *PL_reglastparen = SSPOPINT;
 409  PL_regsize = SSPOPINT;
 410  PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 411
 412  i -= REGCP_OTHER_ELEMS;
 413  /* Now restore the parentheses context. */
 414  for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 415   I32 tmps;
 416   U32 paren = (U32)SSPOPINT;
 417   PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 418   PL_regoffs[paren].start = SSPOPINT;
 419   tmps = SSPOPINT;
 420   if (paren <= *PL_reglastparen)
 421    PL_regoffs[paren].end = tmps;
 422   DEBUG_BUFFERS_r(
 423    PerlIO_printf(Perl_debug_log,
 424       "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 425       (UV)paren, (IV)PL_regoffs[paren].start,
 426       (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 427       (IV)PL_regoffs[paren].end,
 428       (paren > *PL_reglastparen ? "(no)" : ""));
 429   );
 430  }
 431  DEBUG_BUFFERS_r(
 432   if (*PL_reglastparen + 1 <= rex->nparens) {
 433    PerlIO_printf(Perl_debug_log,
 434       "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 435       (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 436   }
 437  );
 438 #if 1
 439  /* It would seem that the similar code in regtry()
 440  * already takes care of this, and in fact it is in
 441  * a better location to since this code can #if 0-ed out
 442  * but the code in regtry() is needed or otherwise tests
 443  * requiring null fields (pat.t#187 and split.t#{13,14}
 444  * (as of patchlevel 7877)  will fail.  Then again,
 445  * this code seems to be necessary or otherwise
 446  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 447  * --jhi updated by dapm */
 448  for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 449   if (i > PL_regsize)
 450    PL_regoffs[i].start = -1;
 451   PL_regoffs[i].end = -1;
 452  }
 453 #endif
 454  return input;
 455 }
 456
 457 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
 458
 459 /*
 460  * pregexec and friends
 461  */
 462
 463 #ifndef PERL_IN_XSUB_RE
 464 /*
 465  - pregexec - match a regexp against a string
 466  */
 467 I32
 468 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 469   char *strbeg, I32 minend, SV *screamer, U32 nosave)
 470 /* strend: pointer to null at end of string */
 471 /* strbeg: real beginning of string */
 472 /* minend: end of match must be >=minend after stringarg. */
 473 /* nosave: For optimizations. */
 474 {
 475  PERL_ARGS_ASSERT_PREGEXEC;
 476
 477  return
 478   regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 479      nosave ? 0 : REXEC_COPY_STR);
 480 }
 481 #endif
 482
 483 /*
 484  * Need to implement the following flags for reg_anch:
 485  *
 486  * USE_INTUIT_NOML  - Useful to call re_intuit_start() first
 487  * USE_INTUIT_ML
 488  * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
 489  * INTUIT_AUTORITATIVE_ML
 490  * INTUIT_ONCE_NOML  - Intuit can match in one location only.
 491  * INTUIT_ONCE_ML
 492  *
 493  * Another flag for this function: SECOND_TIME (so that float substrs
 494  * with giant delta may be not rechecked).
 495  */
 496
 497 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 498
 499 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 500    Otherwise, only SvCUR(sv) is used to get strbeg. */
 501
 502 /* XXXX We assume that strpos is strbeg unless sv. */
 503
 504 /* XXXX Some places assume that there is a fixed substring.
 505   An update may be needed if optimizer marks as "INTUITable"
 506   RExen without fixed substrings.  Similarly, it is assumed that
 507   lengths of all the strings are no more than minlen, thus they
 508   cannot come from lookahead.
 509   (Or minlen should take into account lookahead.)
 510   NOTE: Some of this comment is not correct. minlen does now take account
 511   of lookahead/behind. Further research is required. -- demerphq
 512
 513 */
 514
 515 /* A failure to find a constant substring means that there is no need to make
 516    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 517    finding a substring too deep into the string means that less calls to
 518    regtry() should be needed.
 519
 520    REx compiler's optimizer found 4 possible hints:
 521   a) Anchored substring;
 522   b) Fixed substring;
 523   c) Whether we are anchored (beginning-of-line or \G);
 524   d) First node (of those at offset 0) which may distinguish positions;
 525    We use a)b)d) and multiline-part of c), and try to find a position in the
 526    string which does not contradict any of them.
 527  */
 528
 529 /* Most of decisions we do here should have been done at compile time.
 530    The nodes of the REx which we used for the search should have been
 531    deleted from the finite automaton. */
 532
 533 char *
 534 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 535      char *strend, const U32 flags, re_scream_pos_data *data)
 536 {
 537  dVAR;
 538  struct regexp *const prog = (struct regexp *)SvANY(rx);
 539  register I32 start_shift = 0;
 540  /* Should be nonnegative! */
 541  register I32 end_shift   = 0;
 542  register char *s;
 543  register SV *check;
 544  char *strbeg;
 545  char *t;
 546  const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 547  I32 ml_anch;
 548  register char *other_last = NULL; /* other substr checked before this */
 549  char *check_at = NULL;  /* check substr found at this pos */
 550  const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 551  RXi_GET_DECL(prog,progi);
 552 #ifdef DEBUGGING
 553  const char * const i_strpos = strpos;
 554 #endif
 555  GET_RE_DEBUG_FLAGS_DECL;
 556
 557  PERL_ARGS_ASSERT_RE_INTUIT_START;
 558
 559  RX_MATCH_UTF8_set(rx,utf8_target);
 560
 561  if (RX_UTF8(rx)) {
 562   PL_reg_flags |= RF_utf8;
 563  }
 564  DEBUG_EXECUTE_r(
 565   debug_start_match(rx, utf8_target, strpos, strend,
 566    sv ? "Guessing start of match in sv for"
 567    : "Guessing start of match in string for");
 568    );
 569
 570  /* CHR_DIST() would be more correct here but it makes things slow. */
 571  if (prog->minlen > strend - strpos) {
 572   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 573        "String too short... [re_intuit_start]\n"));
 574   goto fail;
 575  }
 576
 577  strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 578  PL_regeol = strend;
 579  if (utf8_target) {
 580   if (!prog->check_utf8 && prog->check_substr)
 581    to_utf8_substr(prog);
 582   check = prog->check_utf8;
 583  } else {
 584   if (!prog->check_substr && prog->check_utf8)
 585    to_byte_substr(prog);
 586   check = prog->check_substr;
 587  }
 588  if (check == &PL_sv_undef) {
 589   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 590     "Non-utf8 string cannot match utf8 check string\n"));
 591   goto fail;
 592  }
 593  if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
 594   ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 595      || ( (prog->extflags & RXf_ANCH_BOL)
 596       && !multiline ) ); /* Check after \n? */
 597
 598   if (!ml_anch) {
 599   if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 600     && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 601    /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 602    && sv && !SvROK(sv)
 603    && (strpos != strbeg)) {
 604    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 605    goto fail;
 606   }
 607   if (prog->check_offset_min == prog->check_offset_max &&
 608    !(prog->extflags & RXf_CANY_SEEN)) {
 609    /* Substring at constant offset from beg-of-str... */
 610    I32 slen;
 611
 612    s = HOP3c(strpos, prog->check_offset_min, strend);
 613
 614    if (SvTAIL(check)) {
 615     slen = SvCUR(check); /* >= 1 */
 616
 617     if ( strend - s > slen || strend - s < slen - 1
 618      || (strend - s == slen && strend[-1] != '\n')) {
 619      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 620      goto fail_finish;
 621     }
 622     /* Now should match s[0..slen-2] */
 623     slen--;
 624     if (slen && (*SvPVX_const(check) != *s
 625        || (slen > 1
 626         && memNE(SvPVX_const(check), s, slen)))) {
 627     report_neq:
 628      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 629      goto fail_finish;
 630     }
 631    }
 632    else if (*SvPVX_const(check) != *s
 633      || ((slen = SvCUR(check)) > 1
 634       && memNE(SvPVX_const(check), s, slen)))
 635     goto report_neq;
 636    check_at = s;
 637    goto success_at_start;
 638   }
 639   }
 640   /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 641   s = strpos;
 642   start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 643   end_shift = prog->check_end_shift;
 644
 645   if (!ml_anch) {
 646    const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 647           - (SvTAIL(check) != 0);
 648    const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 649
 650    if (end_shift < eshift)
 651     end_shift = eshift;
 652   }
 653  }
 654  else {    /* Can match at random position */
 655   ml_anch = 0;
 656   s = strpos;
 657   start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 658   end_shift = prog->check_end_shift;
 659
 660   /* end shift should be non negative here */
 661  }
 662
 663 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
 664  if (end_shift < 0)
 665   Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 666     (IV)end_shift, RX_PRECOMP(prog));
 667 #endif
 668
 669   restart:
 670  /* Find a possible match in the region s..strend by looking for
 671  the "check" substring in the region corrected by start/end_shift. */
 672
 673  {
 674   I32 srch_start_shift = start_shift;
 675   I32 srch_end_shift = end_shift;
 676   if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 677    srch_end_shift -= ((strbeg - s) - srch_start_shift);
 678    srch_start_shift = strbeg - s;
 679   }
 680  DEBUG_OPTIMISE_MORE_r({
 681   PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 682    (IV)prog->check_offset_min,
 683    (IV)srch_start_shift,
 684    (IV)srch_end_shift,
 685    (IV)prog->check_end_shift);
 686  });
 687
 688  if (flags & REXEC_SCREAM) {
 689   I32 p = -1;   /* Internal iterator of scream. */
 690   I32 * const pp = data ? data->scream_pos : &p;
 691
 692   if (PL_screamfirst[BmRARE(check)] >= 0
 693    || ( BmRARE(check) == '\n'
 694     && (BmPREVIOUS(check) == SvCUR(check) - 1)
 695     && SvTAIL(check) ))
 696    s = screaminstr(sv, check,
 697        srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 698   else
 699    goto fail_finish;
 700   /* we may be pointing at the wrong string */
 701   if (s && RXp_MATCH_COPIED(prog))
 702    s = strbeg + (s - SvPVX_const(sv));
 703   if (data)
 704    *data->scream_olds = s;
 705  }
 706  else {
 707   U8* start_point;
 708   U8* end_point;
 709   if (prog->extflags & RXf_CANY_SEEN) {
 710    start_point= (U8*)(s + srch_start_shift);
 711    end_point= (U8*)(strend - srch_end_shift);
 712   } else {
 713    start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 714    end_point= HOP3(strend, -srch_end_shift, strbeg);
 715   }
 716   DEBUG_OPTIMISE_MORE_r({
 717    PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 718     (int)(end_point - start_point),
 719     (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 720     start_point);
 721   });
 722
 723   s = fbm_instr( start_point, end_point,
 724      check, multiline ? FBMrf_MULTILINE : 0);
 725  }
 726  }
 727  /* Update the count-of-usability, remove useless subpatterns,
 728   unshift s.  */
 729
 730  DEBUG_EXECUTE_r({
 731   RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 732    SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 733   PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 734       (s ? "Found" : "Did not find"),
 735    (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 736     ? "anchored" : "floating"),
 737    quoted,
 738    RE_SV_TAIL(check),
 739    (s ? " at offset " : "...\n") );
 740  });
 741
 742  if (!s)
 743   goto fail_finish;
 744  /* Finish the diagnostic message */
 745  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 746
 747  /* XXX dmq: first branch is for positive lookbehind...
 748  Our check string is offset from the beginning of the pattern.
 749  So we need to do any stclass tests offset forward from that
 750  point. I think. :-(
 751  */
 752
 753
 754
 755  check_at=s;
 756
 757
 758  /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 759  Start with the other substr.
 760  XXXX no SCREAM optimization yet - and a very coarse implementation
 761  XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 762     *always* match.  Probably should be marked during compile...
 763  Probably it is right to do no SCREAM here...
 764  */
 765
 766  if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 767     : (prog->float_substr && prog->anchored_substr))
 768  {
 769   /* Take into account the "other" substring. */
 770   /* XXXX May be hopelessly wrong for UTF... */
 771   if (!other_last)
 772    other_last = strpos;
 773   if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 774   do_other_anchored:
 775    {
 776     char * const last = HOP3c(s, -start_shift, strbeg);
 777     char *last1, *last2;
 778     char * const saved_s = s;
 779     SV* must;
 780
 781     t = s - prog->check_offset_max;
 782     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 783      && (!utf8_target
 784       || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 785        && t > strpos)))
 786      NOOP;
 787     else
 788      t = strpos;
 789     t = HOP3c(t, prog->anchored_offset, strend);
 790     if (t < other_last) /* These positions already checked */
 791      t = other_last;
 792     last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 793     if (last < last1)
 794      last1 = last;
 795     /* XXXX It is not documented what units *_offsets are in.
 796     We assume bytes, but this is clearly wrong.
 797     Meaning this code needs to be carefully reviewed for errors.
 798     dmq.
 799     */
 800
 801     /* On end-of-str: see comment below. */
 802     must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 803     if (must == &PL_sv_undef) {
 804      s = (char*)NULL;
 805      DEBUG_r(must = prog->anchored_utf8); /* for debug */
 806     }
 807     else
 808      s = fbm_instr(
 809       (unsigned char*)t,
 810       HOP3(HOP3(last1, prog->anchored_offset, strend)
 811         + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 812       must,
 813       multiline ? FBMrf_MULTILINE : 0
 814      );
 815     DEBUG_EXECUTE_r({
 816      RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 817       SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 818      PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 819       (s ? "Found" : "Contradicts"),
 820       quoted, RE_SV_TAIL(must));
 821     });
 822
 823
 824     if (!s) {
 825      if (last1 >= last2) {
 826       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 827             ", giving up...\n"));
 828       goto fail_finish;
 829      }
 830      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 831       ", trying floating at offset %ld...\n",
 832       (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 833      other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 834      s = HOP3c(last, 1, strend);
 835      goto restart;
 836     }
 837     else {
 838      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 839       (long)(s - i_strpos)));
 840      t = HOP3c(s, -prog->anchored_offset, strbeg);
 841      other_last = HOP3c(s, 1, strend);
 842      s = saved_s;
 843      if (t == strpos)
 844       goto try_at_start;
 845      goto try_at_offset;
 846     }
 847    }
 848   }
 849   else {  /* Take into account the floating substring. */
 850    char *last, *last1;
 851    char * const saved_s = s;
 852    SV* must;
 853
 854    t = HOP3c(s, -start_shift, strbeg);
 855    last1 = last =
 856     HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 857    if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 858     last = HOP3c(t, prog->float_max_offset, strend);
 859    s = HOP3c(t, prog->float_min_offset, strend);
 860    if (s < other_last)
 861     s = other_last;
 862  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 863    must = utf8_target ? prog->float_utf8 : prog->float_substr;
 864    /* fbm_instr() takes into account exact value of end-of-str
 865    if the check is SvTAIL(ed).  Since false positives are OK,
 866    and end-of-str is not later than strend we are OK. */
 867    if (must == &PL_sv_undef) {
 868     s = (char*)NULL;
 869     DEBUG_r(must = prog->float_utf8); /* for debug message */
 870    }
 871    else
 872     s = fbm_instr((unsigned char*)s,
 873        (unsigned char*)last + SvCUR(must)
 874         - (SvTAIL(must)!=0),
 875        must, multiline ? FBMrf_MULTILINE : 0);
 876    DEBUG_EXECUTE_r({
 877     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 878      SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 879     PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 880      (s ? "Found" : "Contradicts"),
 881      quoted, RE_SV_TAIL(must));
 882    });
 883    if (!s) {
 884     if (last1 == last) {
 885      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 886            ", giving up...\n"));
 887      goto fail_finish;
 888     }
 889     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 890      ", trying anchored starting at offset %ld...\n",
 891      (long)(saved_s + 1 - i_strpos)));
 892     other_last = last;
 893     s = HOP3c(t, 1, strend);
 894     goto restart;
 895    }
 896    else {
 897     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 898      (long)(s - i_strpos)));
 899     other_last = s; /* Fix this later. --Hugo */
 900     s = saved_s;
 901     if (t == strpos)
 902      goto try_at_start;
 903     goto try_at_offset;
 904    }
 905   }
 906  }
 907
 908
 909  t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 910
 911  DEBUG_OPTIMISE_MORE_r(
 912   PerlIO_printf(Perl_debug_log,
 913    "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 914    (IV)prog->check_offset_min,
 915    (IV)prog->check_offset_max,
 916    (IV)(s-strpos),
 917    (IV)(t-strpos),
 918    (IV)(t-s),
 919    (IV)(strend-strpos)
 920   )
 921  );
 922
 923  if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 924   && (!utf8_target
 925    || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 926     && t > strpos)))
 927  {
 928   /* Fixed substring is found far enough so that the match
 929   cannot start at strpos. */
 930  try_at_offset:
 931   if (ml_anch && t[-1] != '\n') {
 932    /* Eventually fbm_*() should handle this, but often
 933    anchored_offset is not 0, so this check will not be wasted. */
 934    /* XXXX In the code below we prefer to look for "^" even in
 935    presence of anchored substrings.  And we search even
 936    beyond the found float position.  These pessimizations
 937    are historical artefacts only.  */
 938   find_anchor:
 939    while (t < strend - prog->minlen) {
 940     if (*t == '\n') {
 941      if (t < check_at - prog->check_offset_min) {
 942       if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
 943        /* Since we moved from the found position,
 944        we definitely contradict the found anchored
 945        substr.  Due to the above check we do not
 946        contradict "check" substr.
 947        Thus we can arrive here only if check substr
 948        is float.  Redo checking for "other"=="fixed".
 949        */
 950        strpos = t + 1;
 951        DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 952         PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 953        goto do_other_anchored;
 954       }
 955       /* We don't contradict the found floating substring. */
 956       /* XXXX Why not check for STCLASS? */
 957       s = t + 1;
 958       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 959        PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 960       goto set_useful;
 961      }
 962      /* Position contradicts check-string */
 963      /* XXXX probably better to look for check-string
 964      than for "\n", so one should lower the limit for t? */
 965      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 966       PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 967      other_last = strpos = s = t + 1;
 968      goto restart;
 969     }
 970     t++;
 971    }
 972    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 973       PL_colors[0], PL_colors[1]));
 974    goto fail_finish;
 975   }
 976   else {
 977    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 978       PL_colors[0], PL_colors[1]));
 979   }
 980   s = t;
 981  set_useful:
 982   ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
 983  }
 984  else {
 985   /* The found string does not prohibit matching at strpos,
 986   - no optimization of calling REx engine can be performed,
 987   unless it was an MBOL and we are not after MBOL,
 988   or a future STCLASS check will fail this. */
 989  try_at_start:
 990   /* Even in this situation we may use MBOL flag if strpos is offset
 991   wrt the start of the string. */
 992   if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 993    && (strpos != strbeg) && strpos[-1] != '\n'
 994    /* May be due to an implicit anchor of m{.*foo}  */
 995    && !(prog->intflags & PREGf_IMPLICIT))
 996   {
 997    t = strpos;
 998    goto find_anchor;
 999   }
1000   DEBUG_EXECUTE_r( if (ml_anch)
1001    PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1002       (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1003   );
1004  success_at_start:
1005   if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1006    && (utf8_target ? (
1007     prog->check_utf8  /* Could be deleted already */
1008     && --BmUSEFUL(prog->check_utf8) < 0
1009     && (prog->check_utf8 == prog->float_utf8)
1010    ) : (
1011     prog->check_substr  /* Could be deleted already */
1012     && --BmUSEFUL(prog->check_substr) < 0
1013     && (prog->check_substr == prog->float_substr)
1014    )))
1015   {
1016    /* If flags & SOMETHING - do not do it many times on the same match */
1017    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1018    /* XXX Does the destruction order has to change with utf8_target? */
1019    SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1020    SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1021    prog->check_substr = prog->check_utf8 = NULL; /* disable */
1022    prog->float_substr = prog->float_utf8 = NULL; /* clear */
1023    check = NULL;   /* abort */
1024    s = strpos;
1025    /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1026      see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1027    if (prog->intflags & PREGf_IMPLICIT)
1028     prog->extflags &= ~RXf_ANCH_MBOL;
1029    /* XXXX This is a remnant of the old implementation.  It
1030      looks wasteful, since now INTUIT can use many
1031      other heuristics. */
1032    prog->extflags &= ~RXf_USE_INTUIT;
1033    /* XXXX What other flags might need to be cleared in this branch? */
1034   }
1035   else
1036    s = strpos;
1037  }
1038
1039  /* Last resort... */
1040  /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1041  /* trie stclasses are too expensive to use here, we are better off to
1042  leave it to regmatch itself */
1043  if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1044   /* minlen == 0 is possible if regstclass is \b or \B,
1045   and the fixed substr is ''$.
1046   Since minlen is already taken into account, s+1 is before strend;
1047   accidentally, minlen >= 1 guaranties no false positives at s + 1
1048   even for \b or \B.  But (minlen? 1 : 0) below assumes that
1049   regstclass does not come from lookahead...  */
1050   /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1051   This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1052   const U8* const str = (U8*)STRING(progi->regstclass);
1053   const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1054      ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1055      : 1);
1056   char * endpos;
1057   if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1058    endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1059   else if (prog->float_substr || prog->float_utf8)
1060    endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1061   else
1062    endpos= strend;
1063
1064   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1065          (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1066
1067   t = s;
1068   s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1069   if (!s) {
1070 #ifdef DEBUGGING
1071    const char *what = NULL;
1072 #endif
1073    if (endpos == strend) {
1074     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1075         "Could not match STCLASS...\n") );
1076     goto fail;
1077    }
1078    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1079         "This position contradicts STCLASS...\n") );
1080    if ((prog->extflags & RXf_ANCH) && !ml_anch)
1081     goto fail;
1082    /* Contradict one of substrings */
1083    if (prog->anchored_substr || prog->anchored_utf8) {
1084     if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1085      DEBUG_EXECUTE_r( what = "anchored" );
1086     hop_and_restart:
1087      s = HOP3c(t, 1, strend);
1088      if (s + start_shift + end_shift > strend) {
1089       /* XXXX Should be taken into account earlier? */
1090       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1091            "Could not match STCLASS...\n") );
1092       goto fail;
1093      }
1094      if (!check)
1095       goto giveup;
1096      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1097         "Looking for %s substr starting at offset %ld...\n",
1098         what, (long)(s + start_shift - i_strpos)) );
1099      goto restart;
1100     }
1101     /* Have both, check_string is floating */
1102     if (t + start_shift >= check_at) /* Contradicts floating=check */
1103      goto retry_floating_check;
1104     /* Recheck anchored substring, but not floating... */
1105     s = check_at;
1106     if (!check)
1107      goto giveup;
1108     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1109       "Looking for anchored substr starting at offset %ld...\n",
1110       (long)(other_last - i_strpos)) );
1111     goto do_other_anchored;
1112    }
1113    /* Another way we could have checked stclass at the
1114    current position only: */
1115    if (ml_anch) {
1116     s = t = t + 1;
1117     if (!check)
1118      goto giveup;
1119     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1120       "Looking for /%s^%s/m starting at offset %ld...\n",
1121       PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1122     goto try_at_offset;
1123    }
1124    if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1125     goto fail;
1126    /* Check is floating substring. */
1127   retry_floating_check:
1128    t = check_at - start_shift;
1129    DEBUG_EXECUTE_r( what = "floating" );
1130    goto hop_and_restart;
1131   }
1132   if (t != s) {
1133    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1134       "By STCLASS: moving %ld --> %ld\n",
1135         (long)(t - i_strpos), (long)(s - i_strpos))
1136     );
1137   }
1138   else {
1139    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1140         "Does not contradict STCLASS...\n");
1141     );
1142   }
1143  }
1144   giveup:
1145  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1146       PL_colors[4], (check ? "Guessed" : "Giving up"),
1147       PL_colors[5], (long)(s - i_strpos)) );
1148  return s;
1149
1150   fail_finish:    /* Substring not found */
1151  if (prog->check_substr || prog->check_utf8)  /* could be removed already */
1152   BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1153   fail:
1154  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1155       PL_colors[4], PL_colors[5]));
1156  return NULL;
1157 }
1158
1159 #define DECL_TRIE_TYPE(scan) \
1160  const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1161      trie_type = (scan->flags != EXACT) \
1162        ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1163        : (utf8_target ? trie_utf8 : trie_plain)
1164
1165 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1166 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1167  switch (trie_type) {                                                    \
1168  case trie_utf8_fold:                                                    \
1169   if ( foldlen>0 ) {                                                  \
1170    uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1171    foldlen -= len;                                                 \
1172    uscan += len;                                                   \
1173    len=0;                                                          \
1174   } else {                                                            \
1175    uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1176    uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1177    foldlen -= UNISKIP( uvc );                                      \
1178    uscan = foldbuf + UNISKIP( uvc );                               \
1179   }                                                                   \
1180   break;                                                              \
1181  case trie_latin_utf8_fold:                                              \
1182   if ( foldlen>0 ) {                                                  \
1183    uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1184    foldlen -= len;                                                 \
1185    uscan += len;                                                   \
1186    len=0;                                                          \
1187   } else {                                                            \
1188    len = 1;                                                        \
1189    uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1190    foldlen -= UNISKIP( uvc );                                      \
1191    uscan = foldbuf + UNISKIP( uvc );                               \
1192   }                                                                   \
1193   break;                                                              \
1194  case trie_utf8:                                                         \
1195   uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1196   break;                                                              \
1197  case trie_plain:                                                        \
1198   uvc = (UV)*uc;                                                      \
1199   len = 1;                                                            \
1200  }                                                                       \
1201  if (uvc < 256) {                                                        \
1202   charid = trie->charmap[ uvc ];                                      \
1203  }                                                                       \
1204  else {                                                                  \
1205   charid = 0;                                                         \
1206   if (widecharmap) {                                                  \
1207    SV** const svpp = hv_fetch(widecharmap,                         \
1208       (char*)&uvc, sizeof(UV), 0);                        \
1209    if (svpp)                                                       \
1210     charid = (U16)SvIV(*svpp);                                  \
1211   }                                                                   \
1212  }                                                                       \
1213 } STMT_END
1214
1215 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1216 STMT_START {                                              \
1217  while (s <= e) {                                      \
1218   if ( (CoNd)                                       \
1219    && (ln == 1 || folder(s, pat_string, ln))    \
1220    && (!reginfo || regtry(reginfo, &s)) )       \
1221    goto got_it;                                  \
1222   s++;                                              \
1223  }                                                     \
1224 } STMT_END
1225
1226 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1227 STMT_START {                                          \
1228  while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1229   CoDe                                          \
1230   s += uskip;                                   \
1231  }                                                 \
1232 } STMT_END
1233
1234 #define REXEC_FBC_SCAN(CoDe)                          \
1235 STMT_START {                                          \
1236  while (s < strend) {                              \
1237   CoDe                                          \
1238   s++;                                          \
1239  }                                                 \
1240 } STMT_END
1241
1242 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1243 REXEC_FBC_UTF8_SCAN(                                  \
1244  if (CoNd) {                                       \
1245   if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1246    goto got_it;                              \
1247   else                                          \
1248    tmp = doevery;                            \
1249  }                                                 \
1250  else                                              \
1251   tmp = 1;                                      \
1252 )
1253
1254 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1255 REXEC_FBC_SCAN(                                       \
1256  if (CoNd) {                                       \
1257   if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1258    goto got_it;                              \
1259   else                                          \
1260    tmp = doevery;                            \
1261  }                                                 \
1262  else                                              \
1263   tmp = 1;                                      \
1264 )
1265
1266 #define REXEC_FBC_TRYIT               \
1267 if ((!reginfo || regtry(reginfo, &s))) \
1268  goto got_it
1269
1270 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1271  if (utf8_target) {                                             \
1272   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1273  }                                                          \
1274  else {                                                     \
1275   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1276  }
1277
1278 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1279  if (utf8_target) {                                             \
1280   UtFpReLoAd;                                            \
1281   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1282  }                                                          \
1283  else {                                                     \
1284   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1285  }
1286
1287 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1288  PL_reg_flags |= RF_tainted;                                \
1289  if (utf8_target) {                                             \
1290   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1291  }                                                          \
1292  else {                                                     \
1293   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1294  }
1295
1296 #define DUMP_EXEC_POS(li,s,doutf8) \
1297  dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1298
1299
1300 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1301   tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1302   tmp = TEST_NON_UTF8(tmp);                                              \
1303   REXEC_FBC_UTF8_SCAN(                                                   \
1304    if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1305     tmp = !tmp;                                                    \
1306     IF_SUCCESS;                                                    \
1307    }                                                                  \
1308    else {                                                             \
1309     IF_FAIL;                                                       \
1310    }                                                                  \
1311   );                                                                     \
1312
1313 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1314   if (s == PL_bostr) {                                                   \
1315    tmp = '\n';                                                        \
1316   }                                                                      \
1317   else {                                                                 \
1318    U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);                 \
1319    tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1320   }                                                                      \
1321   tmp = TeSt1_UtF8;                                                      \
1322   LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1323   REXEC_FBC_UTF8_SCAN(                                                   \
1324    if (tmp == ! (TeSt2_UtF8)) { \
1325     tmp = !tmp;                                                    \
1326     IF_SUCCESS;                                                    \
1327    }                                                                  \
1328    else {                                                             \
1329     IF_FAIL;                                                       \
1330    }                                                                  \
1331   );                                                                     \
1332
1333 /* The only difference between the BOUND and NBOUND cases is that
1334  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1335  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1336  * with the other one being empty */
1337 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1338  FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1339
1340 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1341  FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1342
1343 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1344  FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1345
1346 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1347  FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1348
1349
1350 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1351  * be passed in completely with the variable name being tested, which isn't
1352  * such a clean interface, but this is easier to read than it was before.  We
1353  * are looking for the boundary (or non-boundary between a word and non-word
1354  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1355  * must be different.  Find the "wordness" of the character just prior to this
1356  * one, and compare it with the wordness of this one.  If they differ, we have
1357  * a boundary.  At the beginning of the string, pretend that the previous
1358  * character was a new-line */
1359 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1360  if (utf8_target) {                                                         \
1361     UTF8_CODE \
1362  }                                                                          \
1363  else {  /* Not utf8 */                                                     \
1364   tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';                         \
1365   tmp = TEST_NON_UTF8(tmp);                                              \
1366   REXEC_FBC_SCAN(                                                        \
1367    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1368     tmp = !tmp;                                                    \
1369     IF_SUCCESS;                                                    \
1370    }                                                                  \
1371    else {                                                             \
1372     IF_FAIL;                                                       \
1373    }                                                                  \
1374   );                                                                     \
1375  }                                                                          \
1376  if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))           \
1377   goto got_it;
1378
1379 /* We know what class REx starts with.  Try to find this position... */
1380 /* if reginfo is NULL, its a dryrun */
1381 /* annoyingly all the vars in this routine have different names from their counterparts
1382    in regmatch. /grrr */
1383
1384 STATIC char *
1385 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1386  const char *strend, regmatch_info *reginfo)
1387 {
1388   dVAR;
1389   const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1390   char *pat_string;   /* The pattern's exactish string */
1391   char *pat_end;     /* ptr to end char of pat_string */
1392   re_fold_t folder; /* Function for computing non-utf8 folds */
1393   const U8 *fold_array;   /* array for folding ords < 256 */
1394   STRLEN ln;
1395   STRLEN lnc;
1396   register STRLEN uskip;
1397   U8 c1;
1398   U8 c2;
1399   char *e;
1400   register I32 tmp = 1; /* Scratch variable? */
1401   register const bool utf8_target = PL_reg_match_utf8;
1402   UV utf8_fold_flags = 0;
1403   RXi_GET_DECL(prog,progi);
1404
1405   PERL_ARGS_ASSERT_FIND_BYCLASS;
1406
1407   /* We know what class it must start with. */
1408   switch (OP(c)) {
1409   case ANYOFV:
1410   case ANYOF:
1411    if (utf8_target || OP(c) == ANYOFV) {
1412     STRLEN inclasslen = strend - s;
1413     REXEC_FBC_UTF8_CLASS_SCAN(
1414       reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1415    }
1416    else {
1417     REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1418    }
1419    break;
1420   case CANY:
1421    REXEC_FBC_SCAN(
1422     if (tmp && (!reginfo || regtry(reginfo, &s)))
1423      goto got_it;
1424     else
1425      tmp = doevery;
1426    );
1427    break;
1428
1429   case EXACTFA:
1430    if (UTF_PATTERN || utf8_target) {
1431     utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1432     goto do_exactf_utf8;
1433    }
1434    fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1435    folder = foldEQ_latin1;     /* /a, except the sharp s one which */
1436    goto do_exactf_non_utf8;     /* isn't dealt with by these */
1437
1438   case EXACTFU:
1439    if (UTF_PATTERN || utf8_target) {
1440     utf8_fold_flags = 0;
1441     goto do_exactf_utf8;
1442    }
1443    fold_array = PL_fold_latin1;
1444    folder = foldEQ_latin1;
1445    /* XXX This uses the full utf8 fold because if the pattern contains
1446    * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string.
1447    * There could be a new node type, say EXACTFU_SS, which is
1448    * generated by regcomp only if there is an 'ss', and then every
1449    * other case could goto do_exactf_non_utf8;*/
1450    goto do_exactf_utf8;
1451
1452   case EXACTF:
1453    if (UTF_PATTERN || utf8_target) {
1454     utf8_fold_flags = 0;
1455     goto do_exactf_utf8;
1456    }
1457    fold_array = PL_fold;
1458    folder = foldEQ;
1459    goto do_exactf_non_utf8;
1460
1461   case EXACTFL:
1462    if (UTF_PATTERN || utf8_target) {
1463     utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1464     goto do_exactf_utf8;
1465    }
1466    fold_array = PL_fold_locale;
1467    folder = foldEQ_locale;
1468
1469    /* FALL THROUGH */
1470
1471   do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */
1472
1473    /* The idea in the non-utf8 EXACTF* cases is to first find the
1474    * first character of the EXACTF* node and then, if necessary,
1475    * case-insensitively compare the full text of the node.  c1 is the
1476    * first character.  c2 is its fold.  This logic will not work for
1477    * Unicode semantics and the german sharp ss, which hence should
1478    * not be compiled into a node that gets here. */
1479    pat_string = STRING(c);
1480    ln  = STR_LEN(c); /* length to match in octets/bytes */
1481
1482    e = HOP3c(strend, -((I32)ln), s);
1483
1484    if (!reginfo && e < s) {
1485     e = s;   /* Due to minlen logic of intuit() */
1486    }
1487
1488    c1 = *pat_string;
1489    c2 = fold_array[c1];
1490    if (c1 == c2) { /* If char and fold are the same */
1491     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1492    }
1493    else {
1494     REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1495    }
1496    break;
1497
1498   do_exactf_utf8:
1499
1500    /* If one of the operands is in utf8, we can't use the simpler
1501    * folding above, due to the fact that many different characters
1502    * can have the same fold, or portion of a fold, or different-
1503    * length fold */
1504    pat_string = STRING(c);
1505    ln  = STR_LEN(c); /* length to match in octets/bytes */
1506    pat_end = pat_string + ln;
1507    lnc = (UTF_PATTERN) /* length to match in characters */
1508      ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1509      : ln;
1510
1511    e = HOP3c(strend, -((I32)lnc), s);
1512
1513    if (!reginfo && e < s) {
1514     e = s;   /* Due to minlen logic of intuit() */
1515    }
1516
1517    while (s <= e) {
1518     char *my_strend= (char *)strend;
1519     if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1520      pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1521      && (!reginfo || regtry(reginfo, &s)) )
1522     {
1523      goto got_it;
1524     }
1525     s += UTF8SKIP(s);
1526    }
1527    break;
1528   case BOUNDL:
1529    PL_reg_flags |= RF_tainted;
1530    FBC_BOUND(isALNUM_LC,
1531      isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1532      isALNUM_LC_utf8((U8*)s));
1533    break;
1534   case NBOUNDL:
1535    PL_reg_flags |= RF_tainted;
1536    FBC_NBOUND(isALNUM_LC,
1537      isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1538      isALNUM_LC_utf8((U8*)s));
1539    break;
1540   case BOUND:
1541    FBC_BOUND(isWORDCHAR,
1542      isALNUM_uni(tmp),
1543      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1544    break;
1545   case BOUNDA:
1546    FBC_BOUND_NOLOAD(isWORDCHAR_A,
1547        isWORDCHAR_A(tmp),
1548        isWORDCHAR_A((U8*)s));
1549    break;
1550   case NBOUND:
1551    FBC_NBOUND(isWORDCHAR,
1552      isALNUM_uni(tmp),
1553      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1554    break;
1555   case NBOUNDA:
1556    FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1557        isWORDCHAR_A(tmp),
1558        isWORDCHAR_A((U8*)s));
1559    break;
1560   case BOUNDU:
1561    FBC_BOUND(isWORDCHAR_L1,
1562      isALNUM_uni(tmp),
1563      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1564    break;
1565   case NBOUNDU:
1566    FBC_NBOUND(isWORDCHAR_L1,
1567      isALNUM_uni(tmp),
1568      cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1569    break;
1570   case ALNUML:
1571    REXEC_FBC_CSCAN_TAINT(
1572     isALNUM_LC_utf8((U8*)s),
1573     isALNUM_LC(*s)
1574    );
1575    break;
1576   case ALNUMU:
1577    REXEC_FBC_CSCAN_PRELOAD(
1578     LOAD_UTF8_CHARCLASS_ALNUM(),
1579     swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1580     isWORDCHAR_L1((U8) *s)
1581    );
1582    break;
1583   case ALNUM:
1584    REXEC_FBC_CSCAN_PRELOAD(
1585     LOAD_UTF8_CHARCLASS_ALNUM(),
1586     swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1587     isWORDCHAR((U8) *s)
1588    );
1589    break;
1590   case ALNUMA:
1591    /* Don't need to worry about utf8, as it can match only a single
1592    * byte invariant character */
1593    REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1594    break;
1595   case NALNUMU:
1596    REXEC_FBC_CSCAN_PRELOAD(
1597     LOAD_UTF8_CHARCLASS_ALNUM(),
1598     !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1599     ! isWORDCHAR_L1((U8) *s)
1600    );
1601    break;
1602   case NALNUM:
1603    REXEC_FBC_CSCAN_PRELOAD(
1604     LOAD_UTF8_CHARCLASS_ALNUM(),
1605     !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1606     ! isALNUM(*s)
1607    );
1608    break;
1609   case NALNUMA:
1610    REXEC_FBC_CSCAN(
1611     !isWORDCHAR_A(*s),
1612     !isWORDCHAR_A(*s)
1613    );
1614    break;
1615   case NALNUML:
1616    REXEC_FBC_CSCAN_TAINT(
1617     !isALNUM_LC_utf8((U8*)s),
1618     !isALNUM_LC(*s)
1619    );
1620    break;
1621   case SPACEU:
1622    REXEC_FBC_CSCAN_PRELOAD(
1623     LOAD_UTF8_CHARCLASS_SPACE(),
1624     *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1625     isSPACE_L1((U8) *s)
1626    );
1627    break;
1628   case SPACE:
1629    REXEC_FBC_CSCAN_PRELOAD(
1630     LOAD_UTF8_CHARCLASS_SPACE(),
1631     *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1632     isSPACE((U8) *s)
1633    );
1634    break;
1635   case SPACEA:
1636    /* Don't need to worry about utf8, as it can match only a single
1637    * byte invariant character */
1638    REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1639    break;
1640   case SPACEL:
1641    REXEC_FBC_CSCAN_TAINT(
1642     isSPACE_LC_utf8((U8*)s),
1643     isSPACE_LC(*s)
1644    );
1645    break;
1646   case NSPACEU:
1647    REXEC_FBC_CSCAN_PRELOAD(
1648     LOAD_UTF8_CHARCLASS_SPACE(),
1649     !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1650     ! isSPACE_L1((U8) *s)
1651    );
1652    break;
1653   case NSPACE:
1654    REXEC_FBC_CSCAN_PRELOAD(
1655     LOAD_UTF8_CHARCLASS_SPACE(),
1656     !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1657     ! isSPACE((U8) *s)
1658    );
1659    break;
1660   case NSPACEA:
1661    REXEC_FBC_CSCAN(
1662     !isSPACE_A(*s),
1663     !isSPACE_A(*s)
1664    );
1665    break;
1666   case NSPACEL:
1667    REXEC_FBC_CSCAN_TAINT(
1668     !isSPACE_LC_utf8((U8*)s),
1669     !isSPACE_LC(*s)
1670    );
1671    break;
1672   case DIGIT:
1673    REXEC_FBC_CSCAN_PRELOAD(
1674     LOAD_UTF8_CHARCLASS_DIGIT(),
1675     swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1676     isDIGIT(*s)
1677    );
1678    break;
1679   case DIGITA:
1680    /* Don't need to worry about utf8, as it can match only a single
1681    * byte invariant character */
1682    REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1683    break;
1684   case DIGITL:
1685    REXEC_FBC_CSCAN_TAINT(
1686     isDIGIT_LC_utf8((U8*)s),
1687     isDIGIT_LC(*s)
1688    );
1689    break;
1690   case NDIGIT:
1691    REXEC_FBC_CSCAN_PRELOAD(
1692     LOAD_UTF8_CHARCLASS_DIGIT(),
1693     !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1694     !isDIGIT(*s)
1695    );
1696    break;
1697   case NDIGITA:
1698    REXEC_FBC_CSCAN(
1699     !isDIGIT_A(*s),
1700     !isDIGIT_A(*s)
1701    );
1702    break;
1703   case NDIGITL:
1704    REXEC_FBC_CSCAN_TAINT(
1705     !isDIGIT_LC_utf8((U8*)s),
1706     !isDIGIT_LC(*s)
1707    );
1708    break;
1709   case LNBREAK:
1710    REXEC_FBC_CSCAN(
1711     is_LNBREAK_utf8(s),
1712     is_LNBREAK_latin1(s)
1713    );
1714    break;
1715   case VERTWS:
1716    REXEC_FBC_CSCAN(
1717     is_VERTWS_utf8(s),
1718     is_VERTWS_latin1(s)
1719    );
1720    break;
1721   case NVERTWS:
1722    REXEC_FBC_CSCAN(
1723     !is_VERTWS_utf8(s),
1724     !is_VERTWS_latin1(s)
1725    );
1726    break;
1727   case HORIZWS:
1728    REXEC_FBC_CSCAN(
1729     is_HORIZWS_utf8(s),
1730     is_HORIZWS_latin1(s)
1731    );
1732    break;
1733   case NHORIZWS:
1734    REXEC_FBC_CSCAN(
1735     !is_HORIZWS_utf8(s),
1736     !is_HORIZWS_latin1(s)
1737    );
1738    break;
1739   case AHOCORASICKC:
1740   case AHOCORASICK:
1741    {
1742     DECL_TRIE_TYPE(c);
1743     /* what trie are we using right now */
1744     reg_ac_data *aho
1745      = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1746     reg_trie_data *trie
1747      = (reg_trie_data*)progi->data->data[ aho->trie ];
1748     HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1749
1750     const char *last_start = strend - trie->minlen;
1751 #ifdef DEBUGGING
1752     const char *real_start = s;
1753 #endif
1754     STRLEN maxlen = trie->maxlen;
1755     SV *sv_points;
1756     U8 **points; /* map of where we were in the input string
1757         when reading a given char. For ASCII this
1758         is unnecessary overhead as the relationship
1759         is always 1:1, but for Unicode, especially
1760         case folded Unicode this is not true. */
1761     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1762     U8 *bitmap=NULL;
1763
1764
1765     GET_RE_DEBUG_FLAGS_DECL;
1766
1767     /* We can't just allocate points here. We need to wrap it in
1768     * an SV so it gets freed properly if there is a croak while
1769     * running the match */
1770     ENTER;
1771     SAVETMPS;
1772     sv_points=newSV(maxlen * sizeof(U8 *));
1773     SvCUR_set(sv_points,
1774      maxlen * sizeof(U8 *));
1775     SvPOK_on(sv_points);
1776     sv_2mortal(sv_points);
1777     points=(U8**)SvPV_nolen(sv_points );
1778     if ( trie_type != trie_utf8_fold
1779      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1780     {
1781      if (trie->bitmap)
1782       bitmap=(U8*)trie->bitmap;
1783      else
1784       bitmap=(U8*)ANYOF_BITMAP(c);
1785     }
1786     /* this is the Aho-Corasick algorithm modified a touch
1787     to include special handling for long "unknown char"
1788     sequences. The basic idea being that we use AC as long
1789     as we are dealing with a possible matching char, when
1790     we encounter an unknown char (and we have not encountered
1791     an accepting state) we scan forward until we find a legal
1792     starting char.
1793     AC matching is basically that of trie matching, except
1794     that when we encounter a failing transition, we fall back
1795     to the current states "fail state", and try the current char
1796     again, a process we repeat until we reach the root state,
1797     state 1, or a legal transition. If we fail on the root state
1798     then we can either terminate if we have reached an accepting
1799     state previously, or restart the entire process from the beginning
1800     if we have not.
1801
1802     */
1803     while (s <= last_start) {
1804      const U32 uniflags = UTF8_ALLOW_DEFAULT;
1805      U8 *uc = (U8*)s;
1806      U16 charid = 0;
1807      U32 base = 1;
1808      U32 state = 1;
1809      UV uvc = 0;
1810      STRLEN len = 0;
1811      STRLEN foldlen = 0;
1812      U8 *uscan = (U8*)NULL;
1813      U8 *leftmost = NULL;
1814 #ifdef DEBUGGING
1815      U32 accepted_word= 0;
1816 #endif
1817      U32 pointpos = 0;
1818
1819      while ( state && uc <= (U8*)strend ) {
1820       int failed=0;
1821       U32 word = aho->states[ state ].wordnum;
1822
1823       if( state==1 ) {
1824        if ( bitmap ) {
1825         DEBUG_TRIE_EXECUTE_r(
1826          if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1827           dump_exec_pos( (char *)uc, c, strend, real_start,
1828            (char *)uc, utf8_target );
1829           PerlIO_printf( Perl_debug_log,
1830            " Scanning for legal start char...\n");
1831          }
1832         );
1833         if (utf8_target) {
1834          while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1835           uc += UTF8SKIP(uc);
1836          }
1837         } else {
1838          while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1839           uc++;
1840          }
1841         }
1842         s= (char *)uc;
1843        }
1844        if (uc >(U8*)last_start) break;
1845       }
1846
1847       if ( word ) {
1848        U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1849        if (!leftmost || lpos < leftmost) {
1850         DEBUG_r(accepted_word=word);
1851         leftmost= lpos;
1852        }
1853        if (base==0) break;
1854
1855       }
1856       points[pointpos++ % maxlen]= uc;
1857       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1858            uscan, len, uvc, charid, foldlen,
1859            foldbuf, uniflags);
1860       DEBUG_TRIE_EXECUTE_r({
1861        dump_exec_pos( (char *)uc, c, strend, real_start,
1862         s,   utf8_target );
1863        PerlIO_printf(Perl_debug_log,
1864         " Charid:%3u CP:%4"UVxf" ",
1865         charid, uvc);
1866       });
1867
1868       do {
1869 #ifdef DEBUGGING
1870        word = aho->states[ state ].wordnum;
1871 #endif
1872        base = aho->states[ state ].trans.base;
1873
1874        DEBUG_TRIE_EXECUTE_r({
1875         if (failed)
1876          dump_exec_pos( (char *)uc, c, strend, real_start,
1877           s,   utf8_target );
1878         PerlIO_printf( Perl_debug_log,
1879          "%sState: %4"UVxf", word=%"UVxf,
1880          failed ? " Fail transition to " : "",
1881          (UV)state, (UV)word);
1882        });
1883        if ( base ) {
1884         U32 tmp;
1885         I32 offset;
1886         if (charid &&
1887          ( ((offset = base + charid
1888           - 1 - trie->uniquecharcount)) >= 0)
1889          && ((U32)offset < trie->lasttrans)
1890          && trie->trans[offset].check == state
1891          && (tmp=trie->trans[offset].next))
1892         {
1893          DEBUG_TRIE_EXECUTE_r(
1894           PerlIO_printf( Perl_debug_log," - legal\n"));
1895          state = tmp;
1896          break;
1897         }
1898         else {
1899          DEBUG_TRIE_EXECUTE_r(
1900           PerlIO_printf( Perl_debug_log," - fail\n"));
1901          failed = 1;
1902          state = aho->fail[state];
1903         }
1904        }
1905        else {
1906         /* we must be accepting here */
1907         DEBUG_TRIE_EXECUTE_r(
1908           PerlIO_printf( Perl_debug_log," - accepting\n"));
1909         failed = 1;
1910         break;
1911        }
1912       } while(state);
1913       uc += len;
1914       if (failed) {
1915        if (leftmost)
1916         break;
1917        if (!state) state = 1;
1918       }
1919      }
1920      if ( aho->states[ state ].wordnum ) {
1921       U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1922       if (!leftmost || lpos < leftmost) {
1923        DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1924        leftmost = lpos;
1925       }
1926      }
1927      if (leftmost) {
1928       s = (char*)leftmost;
1929       DEBUG_TRIE_EXECUTE_r({
1930        PerlIO_printf(
1931         Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1932         (UV)accepted_word, (IV)(s - real_start)
1933        );
1934       });
1935       if (!reginfo || regtry(reginfo, &s)) {
1936        FREETMPS;
1937        LEAVE;
1938        goto got_it;
1939       }
1940       s = HOPc(s,1);
1941       DEBUG_TRIE_EXECUTE_r({
1942        PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1943       });
1944      } else {
1945       DEBUG_TRIE_EXECUTE_r(
1946        PerlIO_printf( Perl_debug_log,"No match.\n"));
1947       break;
1948      }
1949     }
1950     FREETMPS;
1951     LEAVE;
1952    }
1953    break;
1954   default:
1955    Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1956    break;
1957   }
1958   return 0;
1959  got_it:
1960   return s;
1961 }
1962
1963
1964 /*
1965  - regexec_flags - match a regexp against a string
1966  */
1967 I32
1968 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1969    char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1970 /* strend: pointer to null at end of string */
1971 /* strbeg: real beginning of string */
1972 /* minend: end of match must be >=minend after stringarg. */
1973 /* data: May be used for some additional optimizations.
1974   Currently its only used, with a U32 cast, for transmitting
1975   the ganch offset when doing a /g match. This will change */
1976 /* nosave: For optimizations. */
1977 {
1978  dVAR;
1979  struct regexp *const prog = (struct regexp *)SvANY(rx);
1980  /*register*/ char *s;
1981  register regnode *c;
1982  /*register*/ char *startpos = stringarg;
1983  I32 minlen;  /* must match at least this many chars */
1984  I32 dontbother = 0; /* how many characters not to try at end */
1985  I32 end_shift = 0;   /* Same for the end. */  /* CC */
1986  I32 scream_pos = -1;  /* Internal iterator of scream. */
1987  char *scream_olds = NULL;
1988  const bool utf8_target = cBOOL(DO_UTF8(sv));
1989  I32 multiline;
1990  RXi_GET_DECL(prog,progi);
1991  regmatch_info reginfo;  /* create some info to pass to regtry etc */
1992  regexp_paren_pair *swap = NULL;
1993  GET_RE_DEBUG_FLAGS_DECL;
1994
1995  PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1996  PERL_UNUSED_ARG(data);
1997
1998  /* Be paranoid... */
1999  if (prog == NULL || startpos == NULL) {
2000   Perl_croak(aTHX_ "NULL regexp parameter");
2001   return 0;
2002  }
2003
2004  multiline = prog->extflags & RXf_PMf_MULTILINE;
2005  reginfo.prog = rx;  /* Yes, sorry that this is confusing.  */
2006
2007  RX_MATCH_UTF8_set(rx, utf8_target);
2008  DEBUG_EXECUTE_r(
2009   debug_start_match(rx, utf8_target, startpos, strend,
2010   "Matching");
2011  );
2012
2013  minlen = prog->minlen;
2014
2015  if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2016   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2017        "String too short [regexec_flags]...\n"));
2018   goto phooey;
2019  }
2020
2021
2022  /* Check validity of program. */
2023  if (UCHARAT(progi->program) != REG_MAGIC) {
2024   Perl_croak(aTHX_ "corrupted regexp program");
2025  }
2026
2027  PL_reg_flags = 0;
2028  PL_reg_eval_set = 0;
2029  PL_reg_maxiter = 0;
2030
2031  if (RX_UTF8(rx))
2032   PL_reg_flags |= RF_utf8;
2033
2034  /* Mark beginning of line for ^ and lookbehind. */
2035  reginfo.bol = startpos; /* XXX not used ??? */
2036  PL_bostr  = strbeg;
2037  reginfo.sv = sv;
2038
2039  /* Mark end of line for $ (and such) */
2040  PL_regeol = strend;
2041
2042  /* see how far we have to get to not match where we matched before */
2043  reginfo.till = startpos+minend;
2044
2045  /* If there is a "must appear" string, look for it. */
2046  s = startpos;
2047
2048  if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2049   MAGIC *mg;
2050   if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2051    reginfo.ganch = startpos + prog->gofs;
2052    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2053    "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2054   } else if (sv && SvTYPE(sv) >= SVt_PVMG
2055     && SvMAGIC(sv)
2056     && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2057     && mg->mg_len >= 0) {
2058    reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2059    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2060     "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2061
2062    if (prog->extflags & RXf_ANCH_GPOS) {
2063     if (s > reginfo.ganch)
2064      goto phooey;
2065     s = reginfo.ganch - prog->gofs;
2066     DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2067      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2068     if (s < strbeg)
2069      goto phooey;
2070    }
2071   }
2072   else if (data) {
2073    reginfo.ganch = strbeg + PTR2UV(data);
2074    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2075     "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2076
2077   } else {    /* pos() not defined */
2078    reginfo.ganch = strbeg;
2079    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2080     "GPOS: reginfo.ganch = strbeg\n"));
2081   }
2082  }
2083  if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2084   /* We have to be careful. If the previous successful match
2085   was from this regex we don't want a subsequent partially
2086   successful match to clobber the old results.
2087   So when we detect this possibility we add a swap buffer
2088   to the re, and switch the buffer each match. If we fail
2089   we switch it back, otherwise we leave it swapped.
2090   */
2091   swap = prog->offs;
2092   /* do we need a save destructor here for eval dies? */
2093   Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2094  }
2095  if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2096   re_scream_pos_data d;
2097
2098   d.scream_olds = &scream_olds;
2099   d.scream_pos = &scream_pos;
2100   s = re_intuit_start(rx, sv, s, strend, flags, &d);
2101   if (!s) {
2102    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2103    goto phooey; /* not present */
2104   }
2105  }
2106
2107
2108
2109  /* Simplest case:  anchored match need be tried only once. */
2110  /*  [unless only anchor is BOL and multiline is set] */
2111  if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2112   if (s == startpos && regtry(&reginfo, &startpos))
2113    goto got_it;
2114   else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2115     || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2116   {
2117    char *end;
2118
2119    if (minlen)
2120     dontbother = minlen - 1;
2121    end = HOP3c(strend, -dontbother, strbeg) - 1;
2122    /* for multiline we only have to try after newlines */
2123    if (prog->check_substr || prog->check_utf8) {
2124     /* because of the goto we can not easily reuse the macros for bifurcating the
2125     unicode/non-unicode match modes here like we do elsewhere - demerphq */
2126     if (utf8_target) {
2127      if (s == startpos)
2128       goto after_try_utf8;
2129      while (1) {
2130       if (regtry(&reginfo, &s)) {
2131        goto got_it;
2132       }
2133      after_try_utf8:
2134       if (s > end) {
2135        goto phooey;
2136       }
2137       if (prog->extflags & RXf_USE_INTUIT) {
2138        s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2139        if (!s) {
2140         goto phooey;
2141        }
2142       }
2143       else {
2144        s += UTF8SKIP(s);
2145       }
2146      }
2147     } /* end search for check string in unicode */
2148     else {
2149      if (s == startpos) {
2150       goto after_try_latin;
2151      }
2152      while (1) {
2153       if (regtry(&reginfo, &s)) {
2154        goto got_it;
2155       }
2156      after_try_latin:
2157       if (s > end) {
2158        goto phooey;
2159       }
2160       if (prog->extflags & RXf_USE_INTUIT) {
2161        s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2162        if (!s) {
2163         goto phooey;
2164        }
2165       }
2166       else {
2167        s++;
2168       }
2169      }
2170     } /* end search for check string in latin*/
2171    } /* end search for check string */
2172    else { /* search for newline */
2173     if (s > startpos) {
2174      /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2175      s--;
2176     }
2177     /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2178     while (s < end) {
2179      if (*s++ == '\n') { /* don't need PL_utf8skip here */
2180       if (regtry(&reginfo, &s))
2181        goto got_it;
2182      }
2183     }
2184    } /* end search for newline */
2185   } /* end anchored/multiline check string search */
2186   goto phooey;
2187  } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2188  {
2189   /* the warning about reginfo.ganch being used without initialization
2190   is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2191   and we only enter this block when the same bit is set. */
2192   char *tmp_s = reginfo.ganch - prog->gofs;
2193
2194   if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2195    goto got_it;
2196   goto phooey;
2197  }
2198
2199  /* Messy cases:  unanchored match. */
2200  if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2201   /* we have /x+whatever/ */
2202   /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2203   char ch;
2204 #ifdef DEBUGGING
2205   int did_match = 0;
2206 #endif
2207   if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2208    utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2209   ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2210
2211   if (utf8_target) {
2212    REXEC_FBC_SCAN(
2213     if (*s == ch) {
2214      DEBUG_EXECUTE_r( did_match = 1 );
2215      if (regtry(&reginfo, &s)) goto got_it;
2216      s += UTF8SKIP(s);
2217      while (s < strend && *s == ch)
2218       s += UTF8SKIP(s);
2219     }
2220    );
2221   }
2222   else {
2223    REXEC_FBC_SCAN(
2224     if (*s == ch) {
2225      DEBUG_EXECUTE_r( did_match = 1 );
2226      if (regtry(&reginfo, &s)) goto got_it;
2227      s++;
2228      while (s < strend && *s == ch)
2229       s++;
2230     }
2231    );
2232   }
2233   DEBUG_EXECUTE_r(if (!did_match)
2234     PerlIO_printf(Perl_debug_log,
2235         "Did not find anchored character...\n")
2236    );
2237  }
2238  else if (prog->anchored_substr != NULL
2239    || prog->anchored_utf8 != NULL
2240    || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2241     && prog->float_max_offset < strend - s)) {
2242   SV *must;
2243   I32 back_max;
2244   I32 back_min;
2245   char *last;
2246   char *last1;  /* Last position checked before */
2247 #ifdef DEBUGGING
2248   int did_match = 0;
2249 #endif
2250   if (prog->anchored_substr || prog->anchored_utf8) {
2251    if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2252     utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2253    must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2254    back_max = back_min = prog->anchored_offset;
2255   } else {
2256    if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2257     utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2258    must = utf8_target ? prog->float_utf8 : prog->float_substr;
2259    back_max = prog->float_max_offset;
2260    back_min = prog->float_min_offset;
2261   }
2262
2263
2264   if (must == &PL_sv_undef)
2265    /* could not downgrade utf8 check substring, so must fail */
2266    goto phooey;
2267
2268   if (back_min<0) {
2269    last = strend;
2270   } else {
2271    last = HOP3c(strend, /* Cannot start after this */
2272     -(I32)(CHR_SVLEN(must)
2273       - (SvTAIL(must) != 0) + back_min), strbeg);
2274   }
2275   if (s > PL_bostr)
2276    last1 = HOPc(s, -1);
2277   else
2278    last1 = s - 1; /* bogus */
2279
2280   /* XXXX check_substr already used to find "s", can optimize if
2281   check_substr==must. */
2282   scream_pos = -1;
2283   dontbother = end_shift;
2284   strend = HOPc(strend, -dontbother);
2285   while ( (s <= last) &&
2286     ((flags & REXEC_SCREAM)
2287     ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2288          end_shift, &scream_pos, 0))
2289     : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2290         (unsigned char*)strend, must,
2291         multiline ? FBMrf_MULTILINE : 0))) ) {
2292    /* we may be pointing at the wrong string */
2293    if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2294     s = strbeg + (s - SvPVX_const(sv));
2295    DEBUG_EXECUTE_r( did_match = 1 );
2296    if (HOPc(s, -back_max) > last1) {
2297     last1 = HOPc(s, -back_min);
2298     s = HOPc(s, -back_max);
2299    }
2300    else {
2301     char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2302
2303     last1 = HOPc(s, -back_min);
2304     s = t;
2305    }
2306    if (utf8_target) {
2307     while (s <= last1) {
2308      if (regtry(&reginfo, &s))
2309       goto got_it;
2310      s += UTF8SKIP(s);
2311     }
2312    }
2313    else {
2314     while (s <= last1) {
2315      if (regtry(&reginfo, &s))
2316       goto got_it;
2317      s++;
2318     }
2319    }
2320   }
2321   DEBUG_EXECUTE_r(if (!did_match) {
2322    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2323     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2324    PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2325        ((must == prog->anchored_substr || must == prog->anchored_utf8)
2326        ? "anchored" : "floating"),
2327     quoted, RE_SV_TAIL(must));
2328   });
2329   goto phooey;
2330  }
2331  else if ( (c = progi->regstclass) ) {
2332   if (minlen) {
2333    const OPCODE op = OP(progi->regstclass);
2334    /* don't bother with what can't match */
2335    if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2336     strend = HOPc(strend, -(minlen - 1));
2337   }
2338   DEBUG_EXECUTE_r({
2339    SV * const prop = sv_newmortal();
2340    regprop(prog, prop, c);
2341    {
2342     RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2343      s,strend-s,60);
2344     PerlIO_printf(Perl_debug_log,
2345      "Matching stclass %.*s against %s (%d bytes)\n",
2346      (int)SvCUR(prop), SvPVX_const(prop),
2347      quoted, (int)(strend - s));
2348    }
2349   });
2350   if (find_byclass(prog, c, s, strend, &reginfo))
2351    goto got_it;
2352   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2353  }
2354  else {
2355   dontbother = 0;
2356   if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2357    /* Trim the end. */
2358    char *last;
2359    SV* float_real;
2360
2361    if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2362     utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2363    float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2364
2365    if (flags & REXEC_SCREAM) {
2366     last = screaminstr(sv, float_real, s - strbeg,
2367         end_shift, &scream_pos, 1); /* last one */
2368     if (!last)
2369      last = scream_olds; /* Only one occurrence. */
2370     /* we may be pointing at the wrong string */
2371     else if (RXp_MATCH_COPIED(prog))
2372      s = strbeg + (s - SvPVX_const(sv));
2373    }
2374    else {
2375     STRLEN len;
2376     const char * const little = SvPV_const(float_real, len);
2377
2378     if (SvTAIL(float_real)) {
2379      if (memEQ(strend - len + 1, little, len - 1))
2380       last = strend - len + 1;
2381      else if (!multiline)
2382       last = memEQ(strend - len, little, len)
2383        ? strend - len : NULL;
2384      else
2385       goto find_last;
2386     } else {
2387     find_last:
2388      if (len)
2389       last = rninstr(s, strend, little, little + len);
2390      else
2391       last = strend; /* matching "$" */
2392     }
2393    }
2394    if (last == NULL) {
2395     DEBUG_EXECUTE_r(
2396      PerlIO_printf(Perl_debug_log,
2397       "%sCan't trim the tail, match fails (should not happen)%s\n",
2398       PL_colors[4], PL_colors[5]));
2399     goto phooey; /* Should not happen! */
2400    }
2401    dontbother = strend - last + prog->float_min_offset;
2402   }
2403   if (minlen && (dontbother < minlen))
2404    dontbother = minlen - 1;
2405   strend -= dontbother;      /* this one's always in bytes! */
2406   /* We don't know much -- general case. */
2407   if (utf8_target) {
2408    for (;;) {
2409     if (regtry(&reginfo, &s))
2410      goto got_it;
2411     if (s >= strend)
2412      break;
2413     s += UTF8SKIP(s);
2414    };
2415   }
2416   else {
2417    do {
2418     if (regtry(&reginfo, &s))
2419      goto got_it;
2420    } while (s++ < strend);
2421   }
2422  }
2423
2424  /* Failure. */
2425  goto phooey;
2426
2427 got_it:
2428  Safefree(swap);
2429  RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2430
2431  if (PL_reg_eval_set)
2432   restore_pos(aTHX_ prog);
2433  if (RXp_PAREN_NAMES(prog))
2434   (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2435
2436  /* make sure $`, $&, $', and $digit will work later */
2437  if ( !(flags & REXEC_NOT_FIRST) ) {
2438   RX_MATCH_COPY_FREE(rx);
2439   if (flags & REXEC_COPY_STR) {
2440    const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2441 #ifdef PERL_OLD_COPY_ON_WRITE
2442    if ((SvIsCOW(sv)
2443     || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2444     if (DEBUG_C_TEST) {
2445      PerlIO_printf(Perl_debug_log,
2446         "Copy on write: regexp capture, type %d\n",
2447         (int) SvTYPE(sv));
2448     }
2449     prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2450     prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2451     assert (SvPOKp(prog->saved_copy));
2452    } else
2453 #endif
2454    {
2455     RX_MATCH_COPIED_on(rx);
2456     s = savepvn(strbeg, i);
2457     prog->subbeg = s;
2458    }
2459    prog->sublen = i;
2460   }
2461   else {
2462    prog->subbeg = strbeg;
2463    prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2464   }
2465  }
2466
2467  return 1;
2468
2469 phooey:
2470  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2471       PL_colors[4], PL_colors[5]));
2472  if (PL_reg_eval_set)
2473   restore_pos(aTHX_ prog);
2474  if (swap) {
2475   /* we failed :-( roll it back */
2476   Safefree(prog->offs);
2477   prog->offs = swap;
2478  }
2479
2480  return 0;
2481 }
2482
2483
2484 /*
2485  - regtry - try match at specific point
2486  */
2487 STATIC I32   /* 0 failure, 1 success */
2488 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2489 {
2490  dVAR;
2491  CHECKPOINT lastcp;
2492  REGEXP *const rx = reginfo->prog;
2493  regexp *const prog = (struct regexp *)SvANY(rx);
2494  RXi_GET_DECL(prog,progi);
2495  GET_RE_DEBUG_FLAGS_DECL;
2496
2497  PERL_ARGS_ASSERT_REGTRY;
2498
2499  reginfo->cutpoint=NULL;
2500
2501  if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2502   MAGIC *mg;
2503
2504   PL_reg_eval_set = RS_init;
2505   DEBUG_EXECUTE_r(DEBUG_s(
2506    PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2507       (IV)(PL_stack_sp - PL_stack_base));
2508    ));
2509   SAVESTACK_CXPOS();
2510   cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2511   /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2512   SAVETMPS;
2513   /* Apparently this is not needed, judging by wantarray. */
2514   /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2515   cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2516
2517   if (reginfo->sv) {
2518    /* Make $_ available to executed code. */
2519    if (reginfo->sv != DEFSV) {
2520     SAVE_DEFSV;
2521     DEFSV_set(reginfo->sv);
2522    }
2523
2524    if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2525     && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2526     /* prepare for quick setting of pos */
2527 #ifdef PERL_OLD_COPY_ON_WRITE
2528     if (SvIsCOW(reginfo->sv))
2529      sv_force_normal_flags(reginfo->sv, 0);
2530 #endif
2531     mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2532         &PL_vtbl_mglob, NULL, 0);
2533     mg->mg_len = -1;
2534    }
2535    PL_reg_magic    = mg;
2536    PL_reg_oldpos   = mg->mg_len;
2537    SAVEDESTRUCTOR_X(restore_pos, prog);
2538   }
2539   if (!PL_reg_curpm) {
2540    Newxz(PL_reg_curpm, 1, PMOP);
2541 #ifdef USE_ITHREADS
2542    {
2543     SV* const repointer = &PL_sv_undef;
2544     /* this regexp is also owned by the new PL_reg_curpm, which
2545     will try to free it.  */
2546     av_push(PL_regex_padav, repointer);
2547     PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2548     PL_regex_pad = AvARRAY(PL_regex_padav);
2549    }
2550 #endif
2551   }
2552 #ifdef USE_ITHREADS
2553   /* It seems that non-ithreads works both with and without this code.
2554   So for efficiency reasons it seems best not to have the code
2555   compiled when it is not needed.  */
2556   /* This is safe against NULLs: */
2557   ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2558   /* PM_reg_curpm owns a reference to this regexp.  */
2559   (void)ReREFCNT_inc(rx);
2560 #endif
2561   PM_SETRE(PL_reg_curpm, rx);
2562   PL_reg_oldcurpm = PL_curpm;
2563   PL_curpm = PL_reg_curpm;
2564   if (RXp_MATCH_COPIED(prog)) {
2565    /*  Here is a serious problem: we cannot rewrite subbeg,
2566     since it may be needed if this match fails.  Thus
2567     $` inside (?{}) could fail... */
2568    PL_reg_oldsaved = prog->subbeg;
2569    PL_reg_oldsavedlen = prog->sublen;
2570 #ifdef PERL_OLD_COPY_ON_WRITE
2571    PL_nrs = prog->saved_copy;
2572 #endif
2573    RXp_MATCH_COPIED_off(prog);
2574   }
2575   else
2576    PL_reg_oldsaved = NULL;
2577   prog->subbeg = PL_bostr;
2578   prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2579  }
2580  DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2581  prog->offs[0].start = *startpos - PL_bostr;
2582  PL_reginput = *startpos;
2583  PL_reglastparen = &prog->lastparen;
2584  PL_reglastcloseparen = &prog->lastcloseparen;
2585  prog->lastparen = 0;
2586  prog->lastcloseparen = 0;
2587  PL_regsize = 0;
2588  PL_regoffs = prog->offs;
2589  if (PL_reg_start_tmpl <= prog->nparens) {
2590   PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2591   if(PL_reg_start_tmp)
2592    Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2593   else
2594    Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2595  }
2596
2597  /* XXXX What this code is doing here?!!!  There should be no need
2598  to do this again and again, PL_reglastparen should take care of
2599  this!  --ilya*/
2600
2601  /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2602  * Actually, the code in regcppop() (which Ilya may be meaning by
2603  * PL_reglastparen), is not needed at all by the test suite
2604  * (op/regexp, op/pat, op/split), but that code is needed otherwise
2605  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2606  * Meanwhile, this code *is* needed for the
2607  * above-mentioned test suite tests to succeed.  The common theme
2608  * on those tests seems to be returning null fields from matches.
2609  * --jhi updated by dapm */
2610 #if 1
2611  if (prog->nparens) {
2612   regexp_paren_pair *pp = PL_regoffs;
2613   register I32 i;
2614   for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2615    ++pp;
2616    pp->start = -1;
2617    pp->end = -1;
2618   }
2619  }
2620 #endif
2621  REGCP_SET(lastcp);
2622  if (regmatch(reginfo, progi->program + 1)) {
2623   PL_regoffs[0].end = PL_reginput - PL_bostr;
2624   return 1;
2625  }
2626  if (reginfo->cutpoint)
2627   *startpos= reginfo->cutpoint;
2628  REGCP_UNWIND(lastcp);
2629  return 0;
2630 }
2631
2632
2633 #define sayYES goto yes
2634 #define sayNO goto no
2635 #define sayNO_SILENT goto no_silent
2636
2637 /* we dont use STMT_START/END here because it leads to
2638    "unreachable code" warnings, which are bogus, but distracting. */
2639 #define CACHEsayNO \
2640  if (ST.cache_mask) \
2641  PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2642  sayNO
2643
2644 /* this is used to determine how far from the left messages like
2645    'failed...' are printed. It should be set such that messages
2646    are inline with the regop output that created them.
2647 */
2648 #define REPORT_CODE_OFF 32
2649
2650
2651 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2652 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2653
2654 #define SLAB_FIRST(s) (&(s)->states[0])
2655 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2656
2657 /* grab a new slab and return the first slot in it */
2658
2659 STATIC regmatch_state *
2660 S_push_slab(pTHX)
2661 {
2662 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2663  dMY_CXT;
2664 #endif
2665  regmatch_slab *s = PL_regmatch_slab->next;
2666  if (!s) {
2667   Newx(s, 1, regmatch_slab);
2668   s->prev = PL_regmatch_slab;
2669   s->next = NULL;
2670   PL_regmatch_slab->next = s;
2671  }
2672  PL_regmatch_slab = s;
2673  return SLAB_FIRST(s);
2674 }
2675
2676
2677 /* push a new state then goto it */
2678
2679 #define PUSH_STATE_GOTO(state, node) \
2680  scan = node; \
2681  st->resume_state = state; \
2682  goto push_state;
2683
2684 /* push a new state with success backtracking, then goto it */
2685
2686 #define PUSH_YES_STATE_GOTO(state, node) \
2687  scan = node; \
2688  st->resume_state = state; \
2689  goto push_yes_state;
2690
2691
2692
2693 /*
2694
2695 regmatch() - main matching routine
2696
2697 This is basically one big switch statement in a loop. We execute an op,
2698 set 'next' to point the next op, and continue. If we come to a point which
2699 we may need to backtrack to on failure such as (A|B|C), we push a
2700 backtrack state onto the backtrack stack. On failure, we pop the top
2701 state, and re-enter the loop at the state indicated. If there are no more
2702 states to pop, we return failure.
2703
2704 Sometimes we also need to backtrack on success; for example /A+/, where
2705 after successfully matching one A, we need to go back and try to
2706 match another one; similarly for lookahead assertions: if the assertion
2707 completes successfully, we backtrack to the state just before the assertion
2708 and then carry on.  In these cases, the pushed state is marked as
2709 'backtrack on success too'. This marking is in fact done by a chain of
2710 pointers, each pointing to the previous 'yes' state. On success, we pop to
2711 the nearest yes state, discarding any intermediate failure-only states.
2712 Sometimes a yes state is pushed just to force some cleanup code to be
2713 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2714 it to free the inner regex.
2715
2716 Note that failure backtracking rewinds the cursor position, while
2717 success backtracking leaves it alone.
2718
2719 A pattern is complete when the END op is executed, while a subpattern
2720 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2721 ops trigger the "pop to last yes state if any, otherwise return true"
2722 behaviour.
2723
2724 A common convention in this function is to use A and B to refer to the two
2725 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2726 the subpattern to be matched possibly multiple times, while B is the entire
2727 rest of the pattern. Variable and state names reflect this convention.
2728
2729 The states in the main switch are the union of ops and failure/success of
2730 substates associated with with that op.  For example, IFMATCH is the op
2731 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2732 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2733 successfully matched A and IFMATCH_A_fail is a state saying that we have
2734 just failed to match A. Resume states always come in pairs. The backtrack
2735 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2736 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2737 on success or failure.
2738
2739 The struct that holds a backtracking state is actually a big union, with
2740 one variant for each major type of op. The variable st points to the
2741 top-most backtrack struct. To make the code clearer, within each
2742 block of code we #define ST to alias the relevant union.
2743
2744 Here's a concrete example of a (vastly oversimplified) IFMATCH
2745 implementation:
2746
2747  switch (state) {
2748  ....
2749
2750 #define ST st->u.ifmatch
2751
2752  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2753   ST.foo = ...; // some state we wish to save
2754   ...
2755   // push a yes backtrack state with a resume value of
2756   // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2757   // first node of A:
2758   PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2759   // NOTREACHED
2760
2761  case IFMATCH_A: // we have successfully executed A; now continue with B
2762   next = B;
2763   bar = ST.foo; // do something with the preserved value
2764   break;
2765
2766  case IFMATCH_A_fail: // A failed, so the assertion failed
2767   ...;   // do some housekeeping, then ...
2768   sayNO; // propagate the failure
2769
2770 #undef ST
2771
2772  ...
2773  }
2774
2775 For any old-timers reading this who are familiar with the old recursive
2776 approach, the code above is equivalent to:
2777
2778  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2779  {
2780   int foo = ...
2781   ...
2782   if (regmatch(A)) {
2783    next = B;
2784    bar = foo;
2785    break;
2786   }
2787   ...;   // do some housekeeping, then ...
2788   sayNO; // propagate the failure
2789  }
2790
2791 The topmost backtrack state, pointed to by st, is usually free. If you
2792 want to claim it, populate any ST.foo fields in it with values you wish to
2793 save, then do one of
2794
2795   PUSH_STATE_GOTO(resume_state, node);
2796   PUSH_YES_STATE_GOTO(resume_state, node);
2797
2798 which sets that backtrack state's resume value to 'resume_state', pushes a
2799 new free entry to the top of the backtrack stack, then goes to 'node'.
2800 On backtracking, the free slot is popped, and the saved state becomes the
2801 new free state. An ST.foo field in this new top state can be temporarily
2802 accessed to retrieve values, but once the main loop is re-entered, it
2803 becomes available for reuse.
2804
2805 Note that the depth of the backtrack stack constantly increases during the
2806 left-to-right execution of the pattern, rather than going up and down with
2807 the pattern nesting. For example the stack is at its maximum at Z at the
2808 end of the pattern, rather than at X in the following:
2809
2810  /(((X)+)+)+....(Y)+....Z/
2811
2812 The only exceptions to this are lookahead/behind assertions and the cut,
2813 (?>A), which pop all the backtrack states associated with A before
2814 continuing.
2815
2816 Backtrack state structs are allocated in slabs of about 4K in size.
2817 PL_regmatch_state and st always point to the currently active state,
2818 and PL_regmatch_slab points to the slab currently containing
2819 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2820 allocated, and is never freed until interpreter destruction. When the slab
2821 is full, a new one is allocated and chained to the end. At exit from
2822 regmatch(), slabs allocated since entry are freed.
2823
2824 */
2825
2826
2827 #define DEBUG_STATE_pp(pp)        \
2828  DEBUG_STATE_r({         \
2829   DUMP_EXEC_POS(locinput, scan, utf8_target);      \
2830   PerlIO_printf(Perl_debug_log,       \
2831    "    %*s"pp" %s%s%s%s%s\n",       \
2832    depth*2, "",        \
2833    PL_reg_name[st->resume_state],                     \
2834    ((st==yes_state||st==mark_state) ? "[" : ""),   \
2835    ((st==yes_state) ? "Y" : ""),                   \
2836    ((st==mark_state) ? "M" : ""),                  \
2837    ((st==yes_state||st==mark_state) ? "]" : "")    \
2838   );                                                  \
2839  });
2840
2841
2842 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2843
2844 #ifdef DEBUGGING
2845
2846 STATIC void
2847 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2848  const char *start, const char *end, const char *blurb)
2849 {
2850  const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2851
2852  PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2853
2854  if (!PL_colorset)
2855    reginitcolors();
2856  {
2857   RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2858    RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2859
2860   RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2861    start, end - start, 60);
2862
2863   PerlIO_printf(Perl_debug_log,
2864    "%s%s REx%s %s against %s\n",
2865      PL_colors[4], blurb, PL_colors[5], s0, s1);
2866
2867   if (utf8_target||utf8_pat)
2868    PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2869     utf8_pat ? "pattern" : "",
2870     utf8_pat && utf8_target ? " and " : "",
2871     utf8_target ? "string" : ""
2872    );
2873  }
2874 }
2875
2876 STATIC void
2877 S_dump_exec_pos(pTHX_ const char *locinput,
2878      const regnode *scan,
2879      const char *loc_regeol,
2880      const char *loc_bostr,
2881      const char *loc_reg_starttry,
2882      const bool utf8_target)
2883 {
2884  const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2885  const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2886  int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2887  /* The part of the string before starttry has one color
2888  (pref0_len chars), between starttry and current
2889  position another one (pref_len - pref0_len chars),
2890  after the current position the third one.
2891  We assume that pref0_len <= pref_len, otherwise we
2892  decrease pref0_len.  */
2893  int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2894   ? (5 + taill) - l : locinput - loc_bostr;
2895  int pref0_len;
2896
2897  PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2898
2899  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2900   pref_len++;
2901  pref0_len = pref_len  - (locinput - loc_reg_starttry);
2902  if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2903   l = ( loc_regeol - locinput > (5 + taill) - pref_len
2904    ? (5 + taill) - pref_len : loc_regeol - locinput);
2905  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2906   l--;
2907  if (pref0_len < 0)
2908   pref0_len = 0;
2909  if (pref0_len > pref_len)
2910   pref0_len = pref_len;
2911  {
2912   const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2913
2914   RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2915    (locinput - pref_len),pref0_len, 60, 4, 5);
2916
2917   RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2918      (locinput - pref_len + pref0_len),
2919      pref_len - pref0_len, 60, 2, 3);
2920
2921   RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2922      locinput, loc_regeol - locinput, 10, 0, 1);
2923
2924   const STRLEN tlen=len0+len1+len2;
2925   PerlIO_printf(Perl_debug_log,
2926      "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2927      (IV)(locinput - loc_bostr),
2928      len0, s0,
2929      len1, s1,
2930      (docolor ? "" : "> <"),
2931      len2, s2,
2932      (int)(tlen > 19 ? 0 :  19 - tlen),
2933      "");
2934  }
2935 }
2936
2937 #endif
2938
2939 /* reg_check_named_buff_matched()
2940  * Checks to see if a named buffer has matched. The data array of
2941  * buffer numbers corresponding to the buffer is expected to reside
2942  * in the regexp->data->data array in the slot stored in the ARG() of
2943  * node involved. Note that this routine doesn't actually care about the
2944  * name, that information is not preserved from compilation to execution.
2945  * Returns the index of the leftmost defined buffer with the given name
2946  * or 0 if non of the buffers matched.
2947  */
2948 STATIC I32
2949 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2950 {
2951  I32 n;
2952  RXi_GET_DECL(rex,rexi);
2953  SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2954  I32 *nums=(I32*)SvPVX(sv_dat);
2955
2956  PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2957
2958  for ( n=0; n<SvIVX(sv_dat); n++ ) {
2959   if ((I32)*PL_reglastparen >= nums[n] &&
2960    PL_regoffs[nums[n]].end != -1)
2961   {
2962    return nums[n];
2963   }
2964  }
2965  return 0;
2966 }
2967
2968
2969 /* free all slabs above current one  - called during LEAVE_SCOPE */
2970
2971 STATIC void
2972 S_clear_backtrack_stack(pTHX_ void *p)
2973 {
2974  regmatch_slab *s = PL_regmatch_slab->next;
2975  PERL_UNUSED_ARG(p);
2976
2977  if (!s)
2978   return;
2979  PL_regmatch_slab->next = NULL;
2980  while (s) {
2981   regmatch_slab * const osl = s;
2982   s = s->next;
2983   Safefree(osl);
2984  }
2985 }
2986
2987
2988 #define SETREX(Re1,Re2) \
2989  if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2990  Re1 = (Re2)
2991
2992 STATIC I32   /* 0 failure, 1 success */
2993 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2994 {
2995 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2996  dMY_CXT;
2997 #endif
2998  dVAR;
2999  register const bool utf8_target = PL_reg_match_utf8;
3000  const U32 uniflags = UTF8_ALLOW_DEFAULT;
3001  REGEXP *rex_sv = reginfo->prog;
3002  regexp *rex = (struct regexp *)SvANY(rex_sv);
3003  RXi_GET_DECL(rex,rexi);
3004  I32 oldsave;
3005  /* the current state. This is a cached copy of PL_regmatch_state */
3006  register regmatch_state *st;
3007  /* cache heavy used fields of st in registers */
3008  register regnode *scan;
3009  register regnode *next;
3010  register U32 n = 0; /* general value; init to avoid compiler warning */
3011  register I32 ln = 0; /* len or last;  init to avoid compiler warning */
3012  register char *locinput = PL_reginput;
3013  register I32 nextchr;   /* is always set to UCHARAT(locinput) */
3014
3015  bool result = 0;     /* return value of S_regmatch */
3016  int depth = 0;     /* depth of backtrack stack */
3017  U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3018  const U32 max_nochange_depth =
3019   (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3020   3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3021  regmatch_state *yes_state = NULL; /* state to pop to on success of
3022                subpattern */
3023  /* mark_state piggy backs on the yes_state logic so that when we unwind
3024  the stack on success we can update the mark_state as we go */
3025  regmatch_state *mark_state = NULL; /* last mark state we have seen */
3026  regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3027  struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3028  U32 state_num;
3029  bool no_final = 0;      /* prevent failure from backtracking? */
3030  bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3031  char *startpoint = PL_reginput;
3032  SV *popmark = NULL;     /* are we looking for a mark? */
3033  SV *sv_commit = NULL;   /* last mark name seen in failure */
3034  SV *sv_yes_mark = NULL; /* last mark name we have seen
3035        during a successful match */
3036  U32 lastopen = 0;       /* last open we saw */
3037  bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3038  SV* const oreplsv = GvSV(PL_replgv);
3039  /* these three flags are set by various ops to signal information to
3040  * the very next op. They have a useful lifetime of exactly one loop
3041  * iteration, and are not preserved or restored by state pushes/pops
3042  */
3043  bool sw = 0;     /* the condition value in (?(cond)a|b) */
3044  bool minmod = 0;     /* the next "{n,m}" is a "{n,m}?" */
3045  int logical = 0;     /* the following EVAL is:
3046         0: (?{...})
3047         1: (?(?{...})X|Y)
3048         2: (??{...})
3049        or the following IFMATCH/UNLESSM is:
3050         false: plain (?=foo)
3051         true:  used as a condition: (?(?=foo))
3052        */
3053 #ifdef DEBUGGING
3054  GET_RE_DEBUG_FLAGS_DECL;
3055 #endif
3056
3057  PERL_ARGS_ASSERT_REGMATCH;
3058
3059  DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3060    PerlIO_printf(Perl_debug_log,"regmatch start\n");
3061  }));
3062  /* on first ever call to regmatch, allocate first slab */
3063  if (!PL_regmatch_slab) {
3064   Newx(PL_regmatch_slab, 1, regmatch_slab);
3065   PL_regmatch_slab->prev = NULL;
3066   PL_regmatch_slab->next = NULL;
3067   PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3068  }
3069
3070  oldsave = PL_savestack_ix;
3071  SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3072  SAVEVPTR(PL_regmatch_slab);
3073  SAVEVPTR(PL_regmatch_state);
3074
3075  /* grab next free state slot */
3076  st = ++PL_regmatch_state;
3077  if (st >  SLAB_LAST(PL_regmatch_slab))
3078   st = PL_regmatch_state = S_push_slab(aTHX);
3079
3080  /* Note that nextchr is a byte even in UTF */
3081  nextchr = UCHARAT(locinput);
3082  scan = prog;
3083  while (scan != NULL) {
3084
3085   DEBUG_EXECUTE_r( {
3086    SV * const prop = sv_newmortal();
3087    regnode *rnext=regnext(scan);
3088    DUMP_EXEC_POS( locinput, scan, utf8_target );
3089    regprop(rex, prop, scan);
3090
3091    PerlIO_printf(Perl_debug_log,
3092      "%3"IVdf":%*s%s(%"IVdf")\n",
3093      (IV)(scan - rexi->program), depth*2, "",
3094      SvPVX_const(prop),
3095      (PL_regkind[OP(scan)] == END || !rnext) ?
3096       0 : (IV)(rnext - rexi->program));
3097   });
3098
3099   next = scan + NEXT_OFF(scan);
3100   if (next == scan)
3101    next = NULL;
3102   state_num = OP(scan);
3103
3104   REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3105  reenter_switch:
3106
3107   assert(PL_reglastparen == &rex->lastparen);
3108   assert(PL_reglastcloseparen == &rex->lastcloseparen);
3109   assert(PL_regoffs == rex->offs);
3110
3111   switch (state_num) {
3112   case BOL:
3113    if (locinput == PL_bostr)
3114    {
3115     /* reginfo->till = reginfo->bol; */
3116     break;
3117    }
3118    sayNO;
3119   case MBOL:
3120    if (locinput == PL_bostr ||
3121     ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3122    {
3123     break;
3124    }
3125    sayNO;
3126   case SBOL:
3127    if (locinput == PL_bostr)
3128     break;
3129    sayNO;
3130   case GPOS:
3131    if (locinput == reginfo->ganch)
3132     break;
3133    sayNO;
3134
3135   case KEEPS:
3136    /* update the startpoint */
3137    st->u.keeper.val = PL_regoffs[0].start;
3138    PL_reginput = locinput;
3139    PL_regoffs[0].start = locinput - PL_bostr;
3140    PUSH_STATE_GOTO(KEEPS_next, next);
3141    /*NOT-REACHED*/
3142   case KEEPS_next_fail:
3143    /* rollback the start point change */
3144    PL_regoffs[0].start = st->u.keeper.val;
3145    sayNO_SILENT;
3146    /*NOT-REACHED*/
3147   case EOL:
3148     goto seol;
3149   case MEOL:
3150    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3151     sayNO;
3152    break;
3153   case SEOL:
3154   seol:
3155    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3156     sayNO;
3157    if (PL_regeol - locinput > 1)
3158     sayNO;
3159    break;
3160   case EOS:
3161    if (PL_regeol != locinput)
3162     sayNO;
3163    break;
3164   case SANY:
3165    if (!nextchr && locinput >= PL_regeol)
3166     sayNO;
3167    if (utf8_target) {
3168     locinput += PL_utf8skip[nextchr];
3169     if (locinput > PL_regeol)
3170      sayNO;
3171     nextchr = UCHARAT(locinput);
3172    }
3173    else
3174     nextchr = UCHARAT(++locinput);
3175    break;
3176   case CANY:
3177    if (!nextchr && locinput >= PL_regeol)
3178     sayNO;
3179    nextchr = UCHARAT(++locinput);
3180    break;
3181   case REG_ANY:
3182    if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3183     sayNO;
3184    if (utf8_target) {
3185     locinput += PL_utf8skip[nextchr];
3186     if (locinput > PL_regeol)
3187      sayNO;
3188     nextchr = UCHARAT(locinput);
3189    }
3190    else
3191     nextchr = UCHARAT(++locinput);
3192    break;
3193
3194 #undef  ST
3195 #define ST st->u.trie
3196   case TRIEC:
3197    /* In this case the charclass data is available inline so
3198    we can fail fast without a lot of extra overhead.
3199    */
3200    if (scan->flags == EXACT || !utf8_target) {
3201     if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3202      DEBUG_EXECUTE_r(
3203       PerlIO_printf(Perl_debug_log,
3204          "%*s  %sfailed to match trie start class...%s\n",
3205          REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3206      );
3207      sayNO_SILENT;
3208      /* NOTREACHED */
3209     }
3210    }
3211    /* FALL THROUGH */
3212   case TRIE:
3213    /* the basic plan of execution of the trie is:
3214    * At the beginning, run though all the states, and
3215    * find the longest-matching word. Also remember the position
3216    * of the shortest matching word. For example, this pattern:
3217    *    1  2 3 4    5
3218    *    ab|a|x|abcd|abc
3219    * when matched against the string "abcde", will generate
3220    * accept states for all words except 3, with the longest
3221    * matching word being 4, and the shortest being 1 (with
3222    * the position being after char 1 of the string).
3223    *
3224    * Then for each matching word, in word order (i.e. 1,2,4,5),
3225    * we run the remainder of the pattern; on each try setting
3226    * the current position to the character following the word,
3227    * returning to try the next word on failure.
3228    *
3229    * We avoid having to build a list of words at runtime by
3230    * using a compile-time structure, wordinfo[].prev, which
3231    * gives, for each word, the previous accepting word (if any).
3232    * In the case above it would contain the mappings 1->2, 2->0,
3233    * 3->0, 4->5, 5->1.  We can use this table to generate, from
3234    * the longest word (4 above), a list of all words, by
3235    * following the list of prev pointers; this gives us the
3236    * unordered list 4,5,1,2. Then given the current word we have
3237    * just tried, we can go through the list and find the
3238    * next-biggest word to try (so if we just failed on word 2,
3239    * the next in the list is 4).
3240    *
3241    * Since at runtime we don't record the matching position in
3242    * the string for each word, we have to work that out for
3243    * each word we're about to process. The wordinfo table holds
3244    * the character length of each word; given that we recorded
3245    * at the start: the position of the shortest word and its
3246    * length in chars, we just need to move the pointer the
3247    * difference between the two char lengths. Depending on
3248    * Unicode status and folding, that's cheap or expensive.
3249    *
3250    * This algorithm is optimised for the case where are only a
3251    * small number of accept states, i.e. 0,1, or maybe 2.
3252    * With lots of accepts states, and having to try all of them,
3253    * it becomes quadratic on number of accept states to find all
3254    * the next words.
3255    */
3256
3257    {
3258     /* what type of TRIE am I? (utf8 makes this contextual) */
3259     DECL_TRIE_TYPE(scan);
3260
3261     /* what trie are we using right now */
3262     reg_trie_data * const trie
3263      = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3264     HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3265     U32 state = trie->startstate;
3266
3267     if (trie->bitmap && trie_type != trie_utf8_fold &&
3268      !TRIE_BITMAP_TEST(trie,*locinput)
3269     ) {
3270      if (trie->states[ state ].wordnum) {
3271       DEBUG_EXECUTE_r(
3272        PerlIO_printf(Perl_debug_log,
3273           "%*s  %smatched empty string...%s\n",
3274           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3275       );
3276       if (!trie->jump)
3277        break;
3278      } else {
3279       DEBUG_EXECUTE_r(
3280        PerlIO_printf(Perl_debug_log,
3281           "%*s  %sfailed to match trie start class...%s\n",
3282           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3283       );
3284       sayNO_SILENT;
3285     }
3286     }
3287
3288    {
3289     U8 *uc = ( U8* )locinput;
3290
3291     STRLEN len = 0;
3292     STRLEN foldlen = 0;
3293     U8 *uscan = (U8*)NULL;
3294     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3295     U32 charcount = 0; /* how many input chars we have matched */
3296     U32 accepted = 0; /* have we seen any accepting states? */
3297
3298     ST.B = next;
3299     ST.jump = trie->jump;
3300     ST.me = scan;
3301     ST.firstpos = NULL;
3302     ST.longfold = FALSE; /* char longer if folded => it's harder */
3303     ST.nextword = 0;
3304
3305     /* fully traverse the TRIE; note the position of the
3306     shortest accept state and the wordnum of the longest
3307     accept state */
3308
3309     while ( state && uc <= (U8*)PL_regeol ) {
3310      U32 base = trie->states[ state ].trans.base;
3311      UV uvc = 0;
3312      U16 charid = 0;
3313      U16 wordnum;
3314      wordnum = trie->states[ state ].wordnum;
3315
3316      if (wordnum) { /* it's an accept state */
3317       if (!accepted) {
3318        accepted = 1;
3319        /* record first match position */
3320        if (ST.longfold) {
3321         ST.firstpos = (U8*)locinput;
3322         ST.firstchars = 0;
3323        }
3324        else {
3325         ST.firstpos = uc;
3326         ST.firstchars = charcount;
3327        }
3328       }
3329       if (!ST.nextword || wordnum < ST.nextword)
3330        ST.nextword = wordnum;
3331       ST.topword = wordnum;
3332      }
3333
3334      DEBUG_TRIE_EXECUTE_r({
3335         DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3336         PerlIO_printf( Perl_debug_log,
3337          "%*s  %sState: %4"UVxf" Accepted: %c ",
3338          2+depth * 2, "", PL_colors[4],
3339          (UV)state, (accepted ? 'Y' : 'N'));
3340      });
3341
3342      /* read a char and goto next state */
3343      if ( base ) {
3344       I32 offset;
3345       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3346            uscan, len, uvc, charid, foldlen,
3347            foldbuf, uniflags);
3348       charcount++;
3349       if (foldlen>0)
3350        ST.longfold = TRUE;
3351       if (charid &&
3352        ( ((offset =
3353        base + charid - 1 - trie->uniquecharcount)) >= 0)
3354
3355        && ((U32)offset < trie->lasttrans)
3356        && trie->trans[offset].check == state)
3357       {
3358        state = trie->trans[offset].next;
3359       }
3360       else {
3361        state = 0;
3362       }
3363       uc += len;
3364
3365      }
3366      else {
3367       state = 0;
3368      }
3369      DEBUG_TRIE_EXECUTE_r(
3370       PerlIO_printf( Perl_debug_log,
3371        "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3372        charid, uvc, (UV)state, PL_colors[5] );
3373      );
3374     }
3375     if (!accepted)
3376     sayNO;
3377
3378     /* calculate total number of accept states */
3379     {
3380      U16 w = ST.topword;
3381      accepted = 0;
3382      while (w) {
3383       w = trie->wordinfo[w].prev;
3384       accepted++;
3385      }
3386      ST.accepted = accepted;
3387     }
3388
3389     DEBUG_EXECUTE_r(
3390      PerlIO_printf( Perl_debug_log,
3391       "%*s  %sgot %"IVdf" possible matches%s\n",
3392       REPORT_CODE_OFF + depth * 2, "",
3393       PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3394     );
3395     goto trie_first_try; /* jump into the fail handler */
3396    }}
3397    /* NOTREACHED */
3398
3399   case TRIE_next_fail: /* we failed - try next alternative */
3400    if ( ST.jump) {
3401     REGCP_UNWIND(ST.cp);
3402     for (n = *PL_reglastparen; n > ST.lastparen; n--)
3403      PL_regoffs[n].end = -1;
3404     *PL_reglastparen = n;
3405    }
3406    if (!--ST.accepted) {
3407     DEBUG_EXECUTE_r({
3408      PerlIO_printf( Perl_debug_log,
3409       "%*s  %sTRIE failed...%s\n",
3410       REPORT_CODE_OFF+depth*2, "",
3411       PL_colors[4],
3412       PL_colors[5] );
3413     });
3414     sayNO_SILENT;
3415    }
3416    {
3417     /* Find next-highest word to process.  Note that this code
3418     * is O(N^2) per trie run (O(N) per branch), so keep tight */
3419     register U16 min = 0;
3420     register U16 word;
3421     register U16 const nextword = ST.nextword;
3422     register reg_trie_wordinfo * const wordinfo
3423      = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3424     for (word=ST.topword; word; word=wordinfo[word].prev) {
3425      if (word > nextword && (!min || word < min))
3426       min = word;
3427     }
3428     ST.nextword = min;
3429    }
3430
3431   trie_first_try:
3432    if (do_cutgroup) {
3433     do_cutgroup = 0;
3434     no_final = 0;
3435    }
3436
3437    if ( ST.jump) {
3438     ST.lastparen = *PL_reglastparen;
3439     REGCP_SET(ST.cp);
3440    }
3441
3442    /* find start char of end of current word */
3443    {
3444     U32 chars; /* how many chars to skip */
3445     U8 *uc = ST.firstpos;
3446     reg_trie_data * const trie
3447      = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3448
3449     assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3450        >=  ST.firstchars);
3451     chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3452        - ST.firstchars;
3453
3454     if (ST.longfold) {
3455      /* the hard option - fold each char in turn and find
3456      * its folded length (which may be different */
3457      U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3458      STRLEN foldlen;
3459      STRLEN len;
3460      UV uvc;
3461      U8 *uscan;
3462
3463      while (chars) {
3464       if (utf8_target) {
3465        uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3466              uniflags);
3467        uc += len;
3468       }
3469       else {
3470        uvc = *uc;
3471        uc++;
3472       }
3473       uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3474       uscan = foldbuf;
3475       while (foldlen) {
3476        if (!--chars)
3477         break;
3478        uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3479            uniflags);
3480        uscan += len;
3481        foldlen -= len;
3482       }
3483      }
3484     }
3485     else {
3486      if (utf8_target)
3487       while (chars--)
3488        uc += UTF8SKIP(uc);
3489      else
3490       uc += chars;
3491     }
3492     PL_reginput = (char *)uc;
3493    }
3494
3495    scan = (ST.jump && ST.jump[ST.nextword])
3496       ? ST.me + ST.jump[ST.nextword]
3497       : ST.B;
3498
3499    DEBUG_EXECUTE_r({
3500     PerlIO_printf( Perl_debug_log,
3501      "%*s  %sTRIE matched word #%d, continuing%s\n",
3502      REPORT_CODE_OFF+depth*2, "",
3503      PL_colors[4],
3504      ST.nextword,
3505      PL_colors[5]
3506      );
3507    });
3508
3509    if (ST.accepted > 1 || has_cutgroup) {
3510     PUSH_STATE_GOTO(TRIE_next, scan);
3511     /* NOTREACHED */
3512    }
3513    /* only one choice left - just continue */
3514    DEBUG_EXECUTE_r({
3515     AV *const trie_words
3516      = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3517     SV ** const tmp = av_fetch( trie_words,
3518      ST.nextword-1, 0 );
3519     SV *sv= tmp ? sv_newmortal() : NULL;
3520
3521     PerlIO_printf( Perl_debug_log,
3522      "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
3523      REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3524      ST.nextword,
3525      tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3526        PL_colors[0], PL_colors[1],
3527        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3528       )
3529      : "not compiled under -Dr",
3530      PL_colors[5] );
3531    });
3532
3533    locinput = PL_reginput;
3534    nextchr = UCHARAT(locinput);
3535    continue; /* execute rest of RE */
3536    /* NOTREACHED */
3537 #undef  ST
3538
3539   case EXACT: {
3540    char *s = STRING(scan);
3541    ln = STR_LEN(scan);
3542    if (utf8_target != UTF_PATTERN) {
3543     /* The target and the pattern have differing utf8ness. */
3544     char *l = locinput;
3545     const char * const e = s + ln;
3546
3547     if (utf8_target) {
3548      /* The target is utf8, the pattern is not utf8. */
3549      while (s < e) {
3550       STRLEN ulen;
3551       if (l >= PL_regeol)
3552        sayNO;
3553       if (NATIVE_TO_UNI(*(U8*)s) !=
3554        utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3555            uniflags))
3556        sayNO;
3557       l += ulen;
3558       s ++;
3559      }
3560     }
3561     else {
3562      /* The target is not utf8, the pattern is utf8. */
3563      while (s < e) {
3564       STRLEN ulen;
3565       if (l >= PL_regeol)
3566        sayNO;
3567       if (NATIVE_TO_UNI(*((U8*)l)) !=
3568        utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3569           uniflags))
3570        sayNO;
3571       s += ulen;
3572       l ++;
3573      }
3574     }
3575     locinput = l;
3576     nextchr = UCHARAT(locinput);
3577     break;
3578    }
3579    /* The target and the pattern have the same utf8ness. */
3580    /* Inline the first character, for speed. */
3581    if (UCHARAT(s) != nextchr)
3582     sayNO;
3583    if (PL_regeol - locinput < ln)
3584     sayNO;
3585    if (ln > 1 && memNE(s, locinput, ln))
3586     sayNO;
3587    locinput += ln;
3588    nextchr = UCHARAT(locinput);
3589    break;
3590    }
3591   case EXACTFL: {
3592    re_fold_t folder;
3593    const U8 * fold_array;
3594    const char * s;
3595    U32 fold_utf8_flags;
3596
3597    PL_reg_flags |= RF_tainted;
3598    folder = foldEQ_locale;
3599    fold_array = PL_fold_locale;
3600    fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3601    goto do_exactf;
3602
3603   case EXACTFU:
3604    folder = foldEQ_latin1;
3605    fold_array = PL_fold_latin1;
3606    fold_utf8_flags = 0;
3607    goto do_exactf;
3608
3609   case EXACTFA:
3610    folder = foldEQ_latin1;
3611    fold_array = PL_fold_latin1;
3612    fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3613    goto do_exactf;
3614
3615   case EXACTF:
3616    folder = foldEQ;
3617    fold_array = PL_fold;
3618    fold_utf8_flags = 0;
3619
3620   do_exactf:
3621    s = STRING(scan);
3622    ln = STR_LEN(scan);
3623
3624    if (utf8_target || UTF_PATTERN) {
3625    /* Either target or the pattern are utf8. */
3626     const char * const l = locinput;
3627     char *e = PL_regeol;
3628
3629     if (! foldEQ_utf8_flags(s, 0,  ln, cBOOL(UTF_PATTERN),
3630        l, &e, 0,  utf8_target, fold_utf8_flags))
3631     {
3632      sayNO;
3633     }
3634     locinput = e;
3635     nextchr = UCHARAT(locinput);
3636     break;
3637    }
3638
3639    /* Neither the target nor the pattern are utf8 */
3640    if (UCHARAT(s) != nextchr &&
3641     UCHARAT(s) != fold_array[nextchr])
3642    {
3643     sayNO;
3644    }
3645    if (PL_regeol - locinput < ln)
3646     sayNO;
3647    if (ln > 1 && ! folder(s, locinput, ln))
3648     sayNO;
3649    locinput += ln;
3650    nextchr = UCHARAT(locinput);
3651    break;
3652   }
3653
3654   /* XXX Could improve efficiency by separating these all out using a
3655   * macro or in-line function.  At that point regcomp.c would no longer
3656   * have to set the FLAGS fields of these */
3657   case BOUNDL:
3658   case NBOUNDL:
3659    PL_reg_flags |= RF_tainted;
3660    /* FALL THROUGH */
3661   case BOUND:
3662   case BOUNDU:
3663   case BOUNDA:
3664   case NBOUND:
3665   case NBOUNDU:
3666   case NBOUNDA:
3667    /* was last char in word? */
3668    if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) {
3669     if (locinput == PL_bostr)
3670      ln = '\n';
3671     else {
3672      const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3673
3674      ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3675     }
3676     if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3677      ln = isALNUM_uni(ln);
3678      LOAD_UTF8_CHARCLASS_ALNUM();
3679      n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3680     }
3681     else {
3682      ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3683      n = isALNUM_LC_utf8((U8*)locinput);
3684     }
3685    }
3686    else {
3687
3688     /* Here the string isn't utf8, or is utf8 and only ascii
3689     * characters are to match \w.  In the latter case looking at
3690     * the byte just prior to the current one may be just the final
3691     * byte of a multi-byte character.  This is ok.  There are two
3692     * cases:
3693     * 1) it is a single byte character, and then the test is doing
3694     * just what it's supposed to.
3695     * 2) it is a multi-byte character, in which case the final
3696     * byte is never mistakable for ASCII, and so the test
3697     * will say it is not a word character, which is the
3698     * correct answer. */
3699     ln = (locinput != PL_bostr) ?
3700      UCHARAT(locinput - 1) : '\n';
3701     switch (FLAGS(scan)) {
3702      case REGEX_UNICODE_CHARSET:
3703       ln = isWORDCHAR_L1(ln);
3704       n = isWORDCHAR_L1(nextchr);
3705       break;
3706      case REGEX_LOCALE_CHARSET:
3707       ln = isALNUM_LC(ln);
3708       n = isALNUM_LC(nextchr);
3709       break;
3710      case REGEX_DEPENDS_CHARSET:
3711       ln = isALNUM(ln);
3712       n = isALNUM(nextchr);
3713       break;
3714      case REGEX_ASCII_RESTRICTED_CHARSET:
3715       ln = isWORDCHAR_A(ln);
3716       n = isWORDCHAR_A(nextchr);
3717       break;
3718      default:
3719       Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3720       break;
3721     }
3722    }
3723    /* Note requires that all BOUNDs be lower than all NBOUNDs in
3724    * regcomp.sym */
3725    if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3726      sayNO;
3727    break;
3728   case ANYOFV:
3729   case ANYOF:
3730    if (utf8_target || state_num == ANYOFV) {
3731     STRLEN inclasslen = PL_regeol - locinput;
3732     if (locinput >= PL_regeol)
3733      sayNO;
3734
3735     if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3736      sayNO;
3737     locinput += inclasslen;
3738     nextchr = UCHARAT(locinput);
3739     break;
3740    }
3741    else {
3742     if (nextchr < 0)
3743      nextchr = UCHARAT(locinput);
3744     if (!nextchr && locinput >= PL_regeol)
3745      sayNO;
3746     if (!REGINCLASS(rex, scan, (U8*)locinput))
3747      sayNO;
3748     nextchr = UCHARAT(++locinput);
3749     break;
3750    }
3751    break;
3752   /* Special char classes - The defines start on line 129 or so */
3753   CCC_TRY_U(ALNUM,  NALNUM,  isWORDCHAR,
3754     ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3755     ALNUMU, NALNUMU, isWORDCHAR_L1,
3756     ALNUMA, NALNUMA, isWORDCHAR_A,
3757     alnum, "a");
3758
3759   CCC_TRY_U(SPACE,  NSPACE,  isSPACE,
3760     SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3761     SPACEU, NSPACEU, isSPACE_L1,
3762     SPACEA, NSPACEA, isSPACE_A,
3763     space, " ");
3764
3765   CCC_TRY(DIGIT,  NDIGIT,  isDIGIT,
3766     DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3767     DIGITA, NDIGITA, isDIGIT_A,
3768     digit, "0");
3769
3770   case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3771      a Unicode extended Grapheme Cluster */
3772    /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3773    extended Grapheme Cluster is:
3774
3775    CR LF
3776    | Prepend* Begin Extend*
3777    | .
3778
3779    Begin is (Hangul-syllable | ! Control)
3780    Extend is (Grapheme_Extend | Spacing_Mark)
3781    Control is [ GCB_Control CR LF ]
3782
3783    The discussion below shows how the code for CLUMP is derived
3784    from this regex.  Note that most of these concepts are from
3785    property values of the Grapheme Cluster Boundary (GCB) property.
3786    No code point can have multiple property values for a given
3787    property.  Thus a code point in Prepend can't be in Control, but
3788    it must be in !Control.  This is why Control above includes
3789    GCB_Control plus CR plus LF.  The latter two are used in the GCB
3790    property separately, and so can't be in GCB_Control, even though
3791    they logically are controls.  Control is not the same as gc=cc,
3792    but includes format and other characters as well.
3793
3794    The Unicode definition of Hangul-syllable is:
3795     L+
3796     | (L* ( ( V | LV ) V* | LVT ) T*)
3797     | T+
3798     )
3799    Each of these is a value for the GCB property, and hence must be
3800    disjoint, so the order they are tested is immaterial, so the
3801    above can safely be changed to
3802     T+
3803     | L+
3804     | (L* ( LVT | ( V | LV ) V*) T*)
3805
3806    The last two terms can be combined like this:
3807     L* ( L
3808       | (( LVT | ( V | LV ) V*) T*))
3809
3810    And refactored into this:
3811     L* (L | LVT T* | V  V* T* | LV  V* T*)
3812
3813    That means that if we have seen any L's at all we can quit
3814    there, but if the next character is a LVT, a V or and LV we
3815    should keep going.
3816
3817    There is a subtlety with Prepend* which showed up in testing.
3818    Note that the Begin, and only the Begin is required in:
3819     | Prepend* Begin Extend*
3820    Also, Begin contains '! Control'.  A Prepend must be a '!
3821    Control', which means it must be a Begin.  What it comes down to
3822    is that if we match Prepend* and then find no suitable Begin
3823    afterwards, that if we backtrack the last Prepend, that one will
3824    be a suitable Begin.
3825    */
3826
3827    if (locinput >= PL_regeol)
3828     sayNO;
3829    if  (! utf8_target) {
3830
3831     /* Match either CR LF  or '.', as all the other possibilities
3832     * require utf8 */
3833     locinput++;     /* Match the . or CR */
3834     if (nextchr == '\r'
3835      && locinput < PL_regeol
3836      && UCHARAT(locinput) == '\n') locinput++;
3837    }
3838    else {
3839
3840     /* Utf8: See if is ( CR LF ); already know that locinput <
3841     * PL_regeol, so locinput+1 is in bounds */
3842     if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3843      locinput += 2;
3844     }
3845     else {
3846      /* In case have to backtrack to beginning, then match '.' */
3847      char *starting = locinput;
3848
3849      /* In case have to backtrack the last prepend */
3850      char *previous_prepend = 0;
3851
3852      LOAD_UTF8_CHARCLASS_GCB();
3853
3854      /* Match (prepend)* */
3855      while (locinput < PL_regeol
3856       && swash_fetch(PL_utf8_X_prepend,
3857           (U8*)locinput, utf8_target))
3858      {
3859       previous_prepend = locinput;
3860       locinput += UTF8SKIP(locinput);
3861      }
3862
3863      /* As noted above, if we matched a prepend character, but
3864      * the next thing won't match, back off the last prepend we
3865      * matched, as it is guaranteed to match the begin */
3866      if (previous_prepend
3867       && (locinput >=  PL_regeol
3868        || ! swash_fetch(PL_utf8_X_begin,
3869            (U8*)locinput, utf8_target)))
3870      {
3871       locinput = previous_prepend;
3872      }
3873
3874      /* Note that here we know PL_regeol > locinput, as we
3875      * tested that upon input to this switch case, and if we
3876      * moved locinput forward, we tested the result just above
3877      * and it either passed, or we backed off so that it will
3878      * now pass */
3879      if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3880
3881       /* Here did not match the required 'Begin' in the
3882       * second term.  So just match the very first
3883       * character, the '.' of the final term of the regex */
3884       locinput = starting + UTF8SKIP(starting);
3885      } else {
3886
3887       /* Here is the beginning of a character that can have
3888       * an extender.  It is either a hangul syllable, or a
3889       * non-control */
3890       if (swash_fetch(PL_utf8_X_non_hangul,
3891           (U8*)locinput, utf8_target))
3892       {
3893
3894        /* Here not a Hangul syllable, must be a
3895        * ('!  * Control') */
3896        locinput += UTF8SKIP(locinput);
3897       } else {
3898
3899        /* Here is a Hangul syllable.  It can be composed
3900        * of several individual characters.  One
3901        * possibility is T+ */
3902        if (swash_fetch(PL_utf8_X_T,
3903            (U8*)locinput, utf8_target))
3904        {
3905         while (locinput < PL_regeol
3906           && swash_fetch(PL_utf8_X_T,
3907               (U8*)locinput, utf8_target))
3908         {
3909          locinput += UTF8SKIP(locinput);
3910         }
3911        } else {
3912
3913         /* Here, not T+, but is a Hangul.  That means
3914         * it is one of the others: L, LV, LVT or V,
3915         * and matches:
3916         * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3917
3918         /* Match L*           */
3919         while (locinput < PL_regeol
3920           && swash_fetch(PL_utf8_X_L,
3921               (U8*)locinput, utf8_target))
3922         {
3923          locinput += UTF8SKIP(locinput);
3924         }
3925
3926         /* Here, have exhausted L*.  If the next
3927         * character is not an LV, LVT nor V, it means
3928         * we had to have at least one L, so matches L+
3929         * in the original equation, we have a complete
3930         * hangul syllable.  Are done. */
3931
3932         if (locinput < PL_regeol
3933          && swash_fetch(PL_utf8_X_LV_LVT_V,
3934              (U8*)locinput, utf8_target))
3935         {
3936
3937          /* Otherwise keep going.  Must be LV, LVT
3938          * or V.  See if LVT */
3939          if (swash_fetch(PL_utf8_X_LVT,
3940              (U8*)locinput, utf8_target))
3941          {
3942           locinput += UTF8SKIP(locinput);
3943          } else {
3944
3945           /* Must be  V or LV.  Take it, then
3946           * match V*     */
3947           locinput += UTF8SKIP(locinput);
3948           while (locinput < PL_regeol
3949             && swash_fetch(PL_utf8_X_V,
3950               (U8*)locinput, utf8_target))
3951           {
3952            locinput += UTF8SKIP(locinput);
3953           }
3954          }
3955
3956          /* And any of LV, LVT, or V can be followed
3957          * by T*            */
3958          while (locinput < PL_regeol
3959           && swash_fetch(PL_utf8_X_T,
3960               (U8*)locinput,
3961               utf8_target))
3962          {
3963           locinput += UTF8SKIP(locinput);
3964          }
3965         }
3966        }
3967       }
3968
3969       /* Match any extender */
3970       while (locinput < PL_regeol
3971         && swash_fetch(PL_utf8_X_extend,
3972             (U8*)locinput, utf8_target))
3973       {
3974        locinput += UTF8SKIP(locinput);
3975       }
3976      }
3977     }
3978     if (locinput > PL_regeol) sayNO;
3979    }
3980    nextchr = UCHARAT(locinput);
3981    break;
3982
3983   case NREFFL:
3984   {   /* The capture buffer cases.  The ones beginning with N for the
3985    named buffers just convert to the equivalent numbered and
3986    pretend they were called as the corresponding numbered buffer
3987    op.  */
3988    /* don't initialize these in the declaration, it makes C++
3989    unhappy */
3990    char *s;
3991    char type;
3992    re_fold_t folder;
3993    const U8 *fold_array;
3994    UV utf8_fold_flags;
3995
3996    PL_reg_flags |= RF_tainted;
3997    folder = foldEQ_locale;
3998    fold_array = PL_fold_locale;
3999    type = REFFL;
4000    utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4001    goto do_nref;
4002
4003   case NREFFA:
4004    folder = foldEQ_latin1;
4005    fold_array = PL_fold_latin1;
4006    type = REFFA;
4007    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4008    goto do_nref;
4009
4010   case NREFFU:
4011    folder = foldEQ_latin1;
4012    fold_array = PL_fold_latin1;
4013    type = REFFU;
4014    utf8_fold_flags = 0;
4015    goto do_nref;
4016
4017   case NREFF:
4018    folder = foldEQ;
4019    fold_array = PL_fold;
4020    type = REFF;
4021    utf8_fold_flags = 0;
4022    goto do_nref;
4023
4024   case NREF:
4025    type = REF;
4026    folder = NULL;
4027    fold_array = NULL;
4028    utf8_fold_flags = 0;
4029   do_nref:
4030
4031    /* For the named back references, find the corresponding buffer
4032    * number */
4033    n = reg_check_named_buff_matched(rex,scan);
4034
4035    if ( ! n ) {
4036     sayNO;
4037    }
4038    goto do_nref_ref_common;
4039
4040   case REFFL:
4041    PL_reg_flags |= RF_tainted;
4042    folder = foldEQ_locale;
4043    fold_array = PL_fold_locale;
4044    utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4045    goto do_ref;
4046
4047   case REFFA:
4048    folder = foldEQ_latin1;
4049    fold_array = PL_fold_latin1;
4050    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4051    goto do_ref;
4052
4053   case REFFU:
4054    folder = foldEQ_latin1;
4055    fold_array = PL_fold_latin1;
4056    utf8_fold_flags = 0;
4057    goto do_ref;
4058
4059   case REFF:
4060    folder = foldEQ;
4061    fold_array = PL_fold;
4062    utf8_fold_flags = 0;
4063    goto do_ref;
4064
4065   case REF:
4066    folder = NULL;
4067    fold_array = NULL;
4068    utf8_fold_flags = 0;
4069
4070   do_ref:
4071    type = OP(scan);
4072    n = ARG(scan);  /* which paren pair */
4073
4074   do_nref_ref_common:
4075    ln = PL_regoffs[n].start;
4076    PL_reg_leftiter = PL_reg_maxiter;  /* Void cache */
4077    if (*PL_reglastparen < n || ln == -1)
4078     sayNO;   /* Do not match unless seen CLOSEn. */
4079    if (ln == PL_regoffs[n].end)
4080     break;
4081
4082    s = PL_bostr + ln;
4083    if (type != REF /* REF can do byte comparison */
4084     && (utf8_target || type == REFFU))
4085    { /* XXX handle REFFL better */
4086     char * limit = PL_regeol;
4087
4088     /* This call case insensitively compares the entire buffer
4089      * at s, with the current input starting at locinput, but
4090      * not going off the end given by PL_regeol, and returns in
4091      * limit upon success, how much of the current input was
4092      * matched */
4093     if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target,
4094          locinput, &limit, 0, utf8_target, utf8_fold_flags))
4095     {
4096      sayNO;
4097     }
4098     locinput = limit;
4099     nextchr = UCHARAT(locinput);
4100     break;
4101    }
4102
4103    /* Not utf8:  Inline the first character, for speed. */
4104    if (UCHARAT(s) != nextchr &&
4105     (type == REF ||
4106     UCHARAT(s) != fold_array[nextchr]))
4107     sayNO;
4108    ln = PL_regoffs[n].end - ln;
4109    if (locinput + ln > PL_regeol)
4110     sayNO;
4111    if (ln > 1 && (type == REF
4112       ? memNE(s, locinput, ln)
4113       : ! folder(s, locinput, ln)))
4114     sayNO;
4115    locinput += ln;
4116    nextchr = UCHARAT(locinput);
4117    break;
4118   }
4119   case NOTHING:
4120   case TAIL:
4121    break;
4122   case BACK:
4123    break;
4124
4125 #undef  ST
4126 #define ST st->u.eval
4127   {
4128    SV *ret;
4129    REGEXP *re_sv;
4130    regexp *re;
4131    regexp_internal *rei;
4132    regnode *startpoint;
4133
4134   case GOSTART:
4135   case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4136    if (cur_eval && cur_eval->locinput==locinput) {
4137     if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4138      Perl_croak(aTHX_ "Infinite recursion in regex");
4139     if ( ++nochange_depth > max_nochange_depth )
4140      Perl_croak(aTHX_
4141       "Pattern subroutine nesting without pos change"
4142       " exceeded limit in regex");
4143    } else {
4144     nochange_depth = 0;
4145    }
4146    re_sv = rex_sv;
4147    re = rex;
4148    rei = rexi;
4149    (void)ReREFCNT_inc(rex_sv);
4150    if (OP(scan)==GOSUB) {
4151     startpoint = scan + ARG2L(scan);
4152     ST.close_paren = ARG(scan);
4153    } else {
4154     startpoint = rei->program+1;
4155     ST.close_paren = 0;
4156    }
4157    goto eval_recurse_doit;
4158    /* NOTREACHED */
4159   case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4160    if (cur_eval && cur_eval->locinput==locinput) {
4161     if ( ++nochange_depth > max_nochange_depth )
4162      Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4163    } else {
4164     nochange_depth = 0;
4165    }
4166    {
4167     /* execute the code in the {...} */
4168     dSP;
4169     SV ** const before = SP;
4170     OP_4tree * const oop = PL_op;
4171     COP * const ocurcop = PL_curcop;
4172     PAD *old_comppad;
4173     char *saved_regeol = PL_regeol;
4174     struct re_save_state saved_state;
4175
4176     /* To not corrupt the existing regex state while executing the
4177     * eval we would normally put it on the save stack, like with
4178     * save_re_context. However, re-evals have a weird scoping so we
4179     * can't just add ENTER/LEAVE here. With that, things like
4180     *
4181     *    (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4182     *
4183     * would break, as they expect the localisation to be unwound
4184     * only when the re-engine backtracks through the bit that
4185     * localised it.
4186     *
4187     * What we do instead is just saving the state in a local c
4188     * variable.
4189     */
4190     Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4191
4192     n = ARG(scan);
4193     PL_op = (OP_4tree*)rexi->data->data[n];
4194     DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4195      "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4196     PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4197     PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4198
4199     if (sv_yes_mark) {
4200      SV *sv_mrk = get_sv("REGMARK", 1);
4201      sv_setsv(sv_mrk, sv_yes_mark);
4202     }
4203
4204     CALLRUNOPS(aTHX);   /* Scalar context. */
4205     SPAGAIN;
4206     if (SP == before)
4207      ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4208     else {
4209      ret = POPs;
4210      PUTBACK;
4211     }
4212
4213     Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4214
4215     PL_op = oop;
4216     PAD_RESTORE_LOCAL(old_comppad);
4217     PL_curcop = ocurcop;
4218     PL_regeol = saved_regeol;
4219     if (!logical) {
4220      /* /(?{...})/ */
4221      sv_setsv(save_scalar(PL_replgv), ret);
4222      break;
4223     }
4224    }
4225    if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4226     logical = 0;
4227     {
4228      /* extract RE object from returned value; compiling if
4229      * necessary */
4230      MAGIC *mg = NULL;
4231      REGEXP *rx = NULL;
4232
4233      if (SvROK(ret)) {
4234       SV *const sv = SvRV(ret);
4235
4236       if (SvTYPE(sv) == SVt_REGEXP) {
4237        rx = (REGEXP*) sv;
4238       } else if (SvSMAGICAL(sv)) {
4239        mg = mg_find(sv, PERL_MAGIC_qr);
4240        assert(mg);
4241       }
4242      } else if (SvTYPE(ret) == SVt_REGEXP) {
4243       rx = (REGEXP*) ret;
4244      } else if (SvSMAGICAL(ret)) {
4245       if (SvGMAGICAL(ret)) {
4246        /* I don't believe that there is ever qr magic
4247        here.  */
4248        assert(!mg_find(ret, PERL_MAGIC_qr));
4249        sv_unmagic(ret, PERL_MAGIC_qr);
4250       }
4251       else {
4252        mg = mg_find(ret, PERL_MAGIC_qr);
4253        /* testing suggests mg only ends up non-NULL for
4254        scalars who were upgraded and compiled in the
4255        else block below. In turn, this is only
4256        triggered in the "postponed utf8 string" tests
4257        in t/op/pat.t  */
4258       }
4259      }
4260
4261      if (mg) {
4262       rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4263       assert(rx);
4264      }
4265      if (rx) {
4266       rx = reg_temp_copy(NULL, rx);
4267      }
4268      else {
4269       U32 pm_flags = 0;
4270       const I32 osize = PL_regsize;
4271
4272       if (DO_UTF8(ret)) {
4273        assert (SvUTF8(ret));
4274       } else if (SvUTF8(ret)) {
4275        /* Not doing UTF-8, despite what the SV says. Is
4276        this only if we're trapped in use 'bytes'?  */
4277        /* Make a copy of the octet sequence, but without
4278        the flag on, as the compiler now honours the
4279        SvUTF8 flag on ret.  */
4280        STRLEN len;
4281        const char *const p = SvPV(ret, len);
4282        ret = newSVpvn_flags(p, len, SVs_TEMP);
4283       }
4284       rx = CALLREGCOMP(ret, pm_flags);
4285       if (!(SvFLAGS(ret)
4286        & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4287         | SVs_GMG))) {
4288        /* This isn't a first class regexp. Instead, it's
4289        caching a regexp onto an existing, Perl visible
4290        scalar.  */
4291        sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4292       }
4293       PL_regsize = osize;
4294      }
4295      re_sv = rx;
4296      re = (struct regexp *)SvANY(rx);
4297     }
4298     RXp_MATCH_COPIED_off(re);
4299     re->subbeg = rex->subbeg;
4300     re->sublen = rex->sublen;
4301     rei = RXi_GET(re);
4302     DEBUG_EXECUTE_r(
4303      debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4304       "Matching embedded");
4305     );
4306     startpoint = rei->program + 1;
4307      ST.close_paren = 0; /* only used for GOSUB */
4308      /* borrowed from regtry */
4309     if (PL_reg_start_tmpl <= re->nparens) {
4310      PL_reg_start_tmpl = re->nparens*3/2 + 3;
4311      if(PL_reg_start_tmp)
4312       Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4313      else
4314       Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4315     }
4316
4317   eval_recurse_doit: /* Share code with GOSUB below this line */
4318     /* run the pattern returned from (??{...}) */
4319     ST.cp = regcppush(0); /* Save *all* the positions. */
4320     REGCP_SET(ST.lastcp);
4321
4322     PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4323
4324     /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4325     PL_reglastparen = &re->lastparen;
4326     PL_reglastcloseparen = &re->lastcloseparen;
4327     re->lastparen = 0;
4328     re->lastcloseparen = 0;
4329
4330     PL_reginput = locinput;
4331     PL_regsize = 0;
4332
4333     /* XXXX This is too dramatic a measure... */
4334     PL_reg_maxiter = 0;
4335
4336     ST.toggle_reg_flags = PL_reg_flags;
4337     if (RX_UTF8(re_sv))
4338      PL_reg_flags |= RF_utf8;
4339     else
4340      PL_reg_flags &= ~RF_utf8;
4341     ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4342
4343     ST.prev_rex = rex_sv;
4344     ST.prev_curlyx = cur_curlyx;
4345     SETREX(rex_sv,re_sv);
4346     rex = re;
4347     rexi = rei;
4348     cur_curlyx = NULL;
4349     ST.B = next;
4350     ST.prev_eval = cur_eval;
4351     cur_eval = st;
4352     /* now continue from first node in postoned RE */
4353     PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4354     /* NOTREACHED */
4355    }
4356    /* logical is 1,   /(?(?{...})X|Y)/ */
4357    sw = cBOOL(SvTRUE(ret));
4358    logical = 0;
4359    break;
4360   }
4361
4362   case EVAL_AB: /* cleanup after a successful (??{A})B */
4363    /* note: this is called twice; first after popping B, then A */
4364    PL_reg_flags ^= ST.toggle_reg_flags;
4365    ReREFCNT_dec(rex_sv);
4366    SETREX(rex_sv,ST.prev_rex);
4367    rex = (struct regexp *)SvANY(rex_sv);
4368    rexi = RXi_GET(rex);
4369    regcpblow(ST.cp);
4370    cur_eval = ST.prev_eval;
4371    cur_curlyx = ST.prev_curlyx;
4372
4373    /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4374    PL_reglastparen = &rex->lastparen;
4375    PL_reglastcloseparen = &rex->lastcloseparen;
4376    /* also update PL_regoffs */
4377    PL_regoffs = rex->offs;
4378
4379    /* XXXX This is too dramatic a measure... */
4380    PL_reg_maxiter = 0;
4381    if ( nochange_depth )
4382     nochange_depth--;
4383    sayYES;
4384
4385
4386   case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4387    /* note: this is called twice; first after popping B, then A */
4388    PL_reg_flags ^= ST.toggle_reg_flags;
4389    ReREFCNT_dec(rex_sv);
4390    SETREX(rex_sv,ST.prev_rex);
4391    rex = (struct regexp *)SvANY(rex_sv);
4392    rexi = RXi_GET(rex);
4393    /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4394    PL_reglastparen = &rex->lastparen;
4395    PL_reglastcloseparen = &rex->lastcloseparen;
4396
4397    PL_reginput = locinput;
4398    REGCP_UNWIND(ST.lastcp);
4399    regcppop(rex);
4400    cur_eval = ST.prev_eval;
4401    cur_curlyx = ST.prev_curlyx;
4402    /* XXXX This is too dramatic a measure... */
4403    PL_reg_maxiter = 0;
4404    if ( nochange_depth )
4405     nochange_depth--;
4406    sayNO_SILENT;
4407 #undef ST
4408
4409   case OPEN:
4410    n = ARG(scan);  /* which paren pair */
4411    PL_reg_start_tmp[n] = locinput;
4412    if (n > PL_regsize)
4413     PL_regsize = n;
4414    lastopen = n;
4415    break;
4416   case CLOSE:
4417    n = ARG(scan);  /* which paren pair */
4418    PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4419    PL_regoffs[n].end = locinput - PL_bostr;
4420    /*if (n > PL_regsize)
4421     PL_regsize = n;*/
4422    if (n > *PL_reglastparen)
4423     *PL_reglastparen = n;
4424    *PL_reglastcloseparen = n;
4425    if (cur_eval && cur_eval->u.eval.close_paren == n) {
4426     goto fake_end;
4427    }
4428    break;
4429   case ACCEPT:
4430    if (ARG(scan)){
4431     regnode *cursor;
4432     for (cursor=scan;
4433      cursor && OP(cursor)!=END;
4434      cursor=regnext(cursor))
4435     {
4436      if ( OP(cursor)==CLOSE ){
4437       n = ARG(cursor);
4438       if ( n <= lastopen ) {
4439        PL_regoffs[n].start
4440         = PL_reg_start_tmp[n] - PL_bostr;
4441        PL_regoffs[n].end = locinput - PL_bostr;
4442        /*if (n > PL_regsize)
4443        PL_regsize = n;*/
4444        if (n > *PL_reglastparen)
4445         *PL_reglastparen = n;
4446        *PL_reglastcloseparen = n;
4447        if ( n == ARG(scan) || (cur_eval &&
4448         cur_eval->u.eval.close_paren == n))
4449         break;
4450       }
4451      }
4452     }
4453    }
4454    goto fake_end;
4455    /*NOTREACHED*/
4456   case GROUPP:
4457    n = ARG(scan);  /* which paren pair */
4458    sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4459    break;
4460   case NGROUPP:
4461    /* reg_check_named_buff_matched returns 0 for no match */
4462    sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4463    break;
4464   case INSUBP:
4465    n = ARG(scan);
4466    sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4467    break;
4468   case DEFINEP:
4469    sw = 0;
4470    break;
4471   case IFTHEN:
4472    PL_reg_leftiter = PL_reg_maxiter;  /* Void cache */
4473    if (sw)
4474     next = NEXTOPER(NEXTOPER(scan));
4475    else {
4476     next = scan + ARG(scan);
4477     if (OP(next) == IFTHEN) /* Fake one. */
4478      next = NEXTOPER(NEXTOPER(next));
4479    }
4480    break;
4481   case LOGICAL:
4482    logical = scan->flags;
4483    break;
4484
4485 /*******************************************************************
4486
4487 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4488 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4489 STAR/PLUS/CURLY/CURLYN are used instead.)
4490
4491 A*B is compiled as <CURLYX><A><WHILEM><B>
4492
4493 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4494 state, which contains the current count, initialised to -1. It also sets
4495 cur_curlyx to point to this state, with any previous value saved in the
4496 state block.
4497
4498 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4499 since the pattern may possibly match zero times (i.e. it's a while {} loop
4500 rather than a do {} while loop).
4501
4502 Each entry to WHILEM represents a successful match of A. The count in the
4503 CURLYX block is incremented, another WHILEM state is pushed, and execution
4504 passes to A or B depending on greediness and the current count.
4505
4506 For example, if matching against the string a1a2a3b (where the aN are
4507 substrings that match /A/), then the match progresses as follows: (the
4508 pushed states are interspersed with the bits of strings matched so far):
4509
4510  <CURLYX cnt=-1>
4511  <CURLYX cnt=0><WHILEM>
4512  <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4513  <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4514  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4515  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4516
4517 (Contrast this with something like CURLYM, which maintains only a single
4518 backtrack state:
4519
4520  <CURLYM cnt=0> a1
4521  a1 <CURLYM cnt=1> a2
4522  a1 a2 <CURLYM cnt=2> a3
4523  a1 a2 a3 <CURLYM cnt=3> b
4524 )
4525
4526 Each WHILEM state block marks a point to backtrack to upon partial failure
4527 of A or B, and also contains some minor state data related to that
4528 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4529 overall state, such as the count, and pointers to the A and B ops.
4530
4531 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4532 must always point to the *current* CURLYX block, the rules are:
4533
4534 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4535 and set cur_curlyx to point the new block.
4536
4537 When popping the CURLYX block after a successful or unsuccessful match,
4538 restore the previous cur_curlyx.
4539
4540 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4541 to the outer one saved in the CURLYX block.
4542
4543 When popping the WHILEM block after a successful or unsuccessful B match,
4544 restore the previous cur_curlyx.
4545
4546 Here's an example for the pattern (AI* BI)*BO
4547 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4548
4549 cur_
4550 curlyx backtrack stack
4551 ------ ---------------
4552 NULL
4553 CO     <CO prev=NULL> <WO>
4554 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4555 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4556 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4557
4558 At this point the pattern succeeds, and we work back down the stack to
4559 clean up, restoring as we go:
4560
4561 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4562 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4563 CO     <CO prev=NULL> <WO>
4564 NULL
4565
4566 *******************************************************************/
4567
4568 #define ST st->u.curlyx
4569
4570   case CURLYX:    /* start of /A*B/  (for complex A) */
4571   {
4572    /* No need to save/restore up to this paren */
4573    I32 parenfloor = scan->flags;
4574
4575    assert(next); /* keep Coverity happy */
4576    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4577     next += ARG(next);
4578
4579    /* XXXX Probably it is better to teach regpush to support
4580    parenfloor > PL_regsize... */
4581    if (parenfloor > (I32)*PL_reglastparen)
4582     parenfloor = *PL_reglastparen; /* Pessimization... */
4583
4584    ST.prev_curlyx= cur_curlyx;
4585    cur_curlyx = st;
4586    ST.cp = PL_savestack_ix;
4587
4588    /* these fields contain the state of the current curly.
4589    * they are accessed by subsequent WHILEMs */
4590    ST.parenfloor = parenfloor;
4591    ST.me = scan;
4592    ST.B = next;
4593    ST.minmod = minmod;
4594    minmod = 0;
4595    ST.count = -1; /* this will be updated by WHILEM */
4596    ST.lastloc = NULL;  /* this will be updated by WHILEM */
4597
4598    PL_reginput = locinput;
4599    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4600    /* NOTREACHED */
4601   }
4602
4603   case CURLYX_end: /* just finished matching all of A*B */
4604    cur_curlyx = ST.prev_curlyx;
4605    sayYES;
4606    /* NOTREACHED */
4607
4608   case CURLYX_end_fail: /* just failed to match all of A*B */
4609    regcpblow(ST.cp);
4610    cur_curlyx = ST.prev_curlyx;
4611    sayNO;
4612    /* NOTREACHED */
4613
4614
4615 #undef ST
4616 #define ST st->u.whilem
4617
4618   case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4619   {
4620    /* see the discussion above about CURLYX/WHILEM */
4621    I32 n;
4622    int min = ARG1(cur_curlyx->u.curlyx.me);
4623    int max = ARG2(cur_curlyx->u.curlyx.me);
4624    regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4625
4626    assert(cur_curlyx); /* keep Coverity happy */
4627    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4628    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4629    ST.cache_offset = 0;
4630    ST.cache_mask = 0;
4631
4632    PL_reginput = locinput;
4633
4634    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4635     "%*s  whilem: matched %ld out of %d..%d\n",
4636     REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4637    );
4638
4639    /* First just match a string of min A's. */
4640
4641    if (n < min) {
4642     ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4643     cur_curlyx->u.curlyx.lastloc = locinput;
4644     REGCP_SET(ST.lastcp);
4645
4646     PUSH_STATE_GOTO(WHILEM_A_pre, A);
4647     /* NOTREACHED */
4648    }
4649
4650    /* If degenerate A matches "", assume A done. */
4651
4652    if (locinput == cur_curlyx->u.curlyx.lastloc) {
4653     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4654     "%*s  whilem: empty match detected, trying continuation...\n",
4655     REPORT_CODE_OFF+depth*2, "")
4656     );
4657     goto do_whilem_B_max;
4658    }
4659
4660    /* super-linear cache processing */
4661
4662    if (scan->flags) {
4663
4664     if (!PL_reg_maxiter) {
4665      /* start the countdown: Postpone detection until we
4666      * know the match is not *that* much linear. */
4667      PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4668      /* possible overflow for long strings and many CURLYX's */
4669      if (PL_reg_maxiter < 0)
4670       PL_reg_maxiter = I32_MAX;
4671      PL_reg_leftiter = PL_reg_maxiter;
4672     }
4673
4674     if (PL_reg_leftiter-- == 0) {
4675      /* initialise cache */
4676      const I32 size = (PL_reg_maxiter + 7)/8;
4677      if (PL_reg_poscache) {
4678       if ((I32)PL_reg_poscache_size < size) {
4679        Renew(PL_reg_poscache, size, char);
4680        PL_reg_poscache_size = size;
4681       }
4682       Zero(PL_reg_poscache, size, char);
4683      }
4684      else {
4685       PL_reg_poscache_size = size;
4686       Newxz(PL_reg_poscache, size, char);
4687      }
4688      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4689  "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4690        PL_colors[4], PL_colors[5])
4691      );
4692     }
4693
4694     if (PL_reg_leftiter < 0) {
4695      /* have we already failed at this position? */
4696      I32 offset, mask;
4697      offset  = (scan->flags & 0xf) - 1
4698         + (locinput - PL_bostr)  * (scan->flags>>4);
4699      mask    = 1 << (offset % 8);
4700      offset /= 8;
4701      if (PL_reg_poscache[offset] & mask) {
4702       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4703        "%*s  whilem: (cache) already tried at this position...\n",
4704        REPORT_CODE_OFF+depth*2, "")
4705       );
4706       sayNO; /* cache records failure */
4707      }
4708      ST.cache_offset = offset;
4709      ST.cache_mask   = mask;
4710     }
4711    }
4712
4713    /* Prefer B over A for minimal matching. */
4714
4715    if (cur_curlyx->u.curlyx.minmod) {
4716     ST.save_curlyx = cur_curlyx;
4717     cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4718     ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4719     REGCP_SET(ST.lastcp);
4720     PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4721     /* NOTREACHED */
4722    }
4723
4724    /* Prefer A over B for maximal matching. */
4725
4726    if (n < max) { /* More greed allowed? */
4727     ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4728     cur_curlyx->u.curlyx.lastloc = locinput;
4729     REGCP_SET(ST.lastcp);
4730     PUSH_STATE_GOTO(WHILEM_A_max, A);
4731     /* NOTREACHED */
4732    }
4733    goto do_whilem_B_max;
4734   }
4735   /* NOTREACHED */
4736
4737   case WHILEM_B_min: /* just matched B in a minimal match */
4738   case WHILEM_B_max: /* just matched B in a maximal match */
4739    cur_curlyx = ST.save_curlyx;
4740    sayYES;
4741    /* NOTREACHED */
4742
4743   case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4744    cur_curlyx = ST.save_curlyx;
4745    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4746    cur_curlyx->u.curlyx.count--;
4747    CACHEsayNO;
4748    /* NOTREACHED */
4749
4750   case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4751    /* FALL THROUGH */
4752   case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4753    REGCP_UNWIND(ST.lastcp);
4754    regcppop(rex);
4755    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4756    cur_curlyx->u.curlyx.count--;
4757    CACHEsayNO;
4758    /* NOTREACHED */
4759
4760   case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4761    REGCP_UNWIND(ST.lastcp);
4762    regcppop(rex); /* Restore some previous $<digit>s? */
4763    PL_reginput = locinput;
4764    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4765     "%*s  whilem: failed, trying continuation...\n",
4766     REPORT_CODE_OFF+depth*2, "")
4767    );
4768   do_whilem_B_max:
4769    if (cur_curlyx->u.curlyx.count >= REG_INFTY
4770     && ckWARN(WARN_REGEXP)
4771     && !(PL_reg_flags & RF_warned))
4772    {
4773     PL_reg_flags |= RF_warned;
4774     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4775      "Complex regular subexpression recursion",
4776      REG_INFTY - 1);
4777    }
4778
4779    /* now try B */
4780    ST.save_curlyx = cur_curlyx;
4781    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4782    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4783    /* NOTREACHED */
4784
4785   case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4786    cur_curlyx = ST.save_curlyx;
4787    REGCP_UNWIND(ST.lastcp);
4788    regcppop(rex);
4789
4790    if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4791     /* Maximum greed exceeded */
4792     if (cur_curlyx->u.curlyx.count >= REG_INFTY
4793      && ckWARN(WARN_REGEXP)
4794      && !(PL_reg_flags & RF_warned))
4795     {
4796      PL_reg_flags |= RF_warned;
4797      Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4798       "%s limit (%d) exceeded",
4799       "Complex regular subexpression recursion",
4800       REG_INFTY - 1);
4801     }
4802     cur_curlyx->u.curlyx.count--;
4803     CACHEsayNO;
4804    }
4805
4806    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4807     "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4808    );
4809    /* Try grabbing another A and see if it helps. */
4810    PL_reginput = locinput;
4811    cur_curlyx->u.curlyx.lastloc = locinput;
4812    ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4813    REGCP_SET(ST.lastcp);
4814    PUSH_STATE_GOTO(WHILEM_A_min,
4815     /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4816    /* NOTREACHED */
4817
4818 #undef  ST
4819 #define ST st->u.branch
4820
4821   case BRANCHJ:     /*  /(...|A|...)/ with long next pointer */
4822    next = scan + ARG(scan);
4823    if (next == scan)
4824     next = NULL;
4825    scan = NEXTOPER(scan);
4826    /* FALL THROUGH */
4827
4828   case BRANCH:     /*  /(...|A|...)/ */
4829    scan = NEXTOPER(scan); /* scan now points to inner node */
4830    ST.lastparen = *PL_reglastparen;
4831    ST.next_branch = next;
4832    REGCP_SET(ST.cp);
4833    PL_reginput = locinput;
4834
4835    /* Now go into the branch */
4836    if (has_cutgroup) {
4837     PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4838    } else {
4839     PUSH_STATE_GOTO(BRANCH_next, scan);
4840    }
4841    /* NOTREACHED */
4842   case CUTGROUP:
4843    PL_reginput = locinput;
4844    sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4845     MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4846    PUSH_STATE_GOTO(CUTGROUP_next,next);
4847    /* NOTREACHED */
4848   case CUTGROUP_next_fail:
4849    do_cutgroup = 1;
4850    no_final = 1;
4851    if (st->u.mark.mark_name)
4852     sv_commit = st->u.mark.mark_name;
4853    sayNO;
4854    /* NOTREACHED */
4855   case BRANCH_next:
4856    sayYES;
4857    /* NOTREACHED */
4858   case BRANCH_next_fail: /* that branch failed; try the next, if any */
4859    if (do_cutgroup) {
4860     do_cutgroup = 0;
4861     no_final = 0;
4862    }
4863    REGCP_UNWIND(ST.cp);
4864    for (n = *PL_reglastparen; n > ST.lastparen; n--)
4865     PL_regoffs[n].end = -1;
4866    *PL_reglastparen = n;
4867    /*dmq: *PL_reglastcloseparen = n; */
4868    scan = ST.next_branch;
4869    /* no more branches? */
4870    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4871     DEBUG_EXECUTE_r({
4872      PerlIO_printf( Perl_debug_log,
4873       "%*s  %sBRANCH failed...%s\n",
4874       REPORT_CODE_OFF+depth*2, "",
4875       PL_colors[4],
4876       PL_colors[5] );
4877     });
4878     sayNO_SILENT;
4879    }
4880    continue; /* execute next BRANCH[J] op */
4881    /* NOTREACHED */
4882
4883   case MINMOD:
4884    minmod = 1;
4885    break;
4886
4887 #undef  ST
4888 #define ST st->u.curlym
4889
4890   case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4891
4892    /* This is an optimisation of CURLYX that enables us to push
4893    * only a single backtracking state, no matter how many matches
4894    * there are in {m,n}. It relies on the pattern being constant
4895    * length, with no parens to influence future backrefs
4896    */
4897
4898    ST.me = scan;
4899    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4900
4901    /* if paren positive, emulate an OPEN/CLOSE around A */
4902    if (ST.me->flags) {
4903     U32 paren = ST.me->flags;
4904     if (paren > PL_regsize)
4905      PL_regsize = paren;
4906     if (paren > *PL_reglastparen)
4907      *PL_reglastparen = paren;
4908     scan += NEXT_OFF(scan); /* Skip former OPEN. */
4909    }
4910    ST.A = scan;
4911    ST.B = next;
4912    ST.alen = 0;
4913    ST.count = 0;
4914    ST.minmod = minmod;
4915    minmod = 0;
4916    ST.c1 = CHRTEST_UNINIT;
4917    REGCP_SET(ST.cp);
4918
4919    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4920     goto curlym_do_B;
4921
4922   curlym_do_A: /* execute the A in /A{m,n}B/  */
4923    PL_reginput = locinput;
4924    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4925    /* NOTREACHED */
4926
4927   case CURLYM_A: /* we've just matched an A */
4928    locinput = st->locinput;
4929    nextchr = UCHARAT(locinput);
4930
4931    ST.count++;
4932    /* after first match, determine A's length: u.curlym.alen */
4933    if (ST.count == 1) {
4934     if (PL_reg_match_utf8) {
4935      char *s = locinput;
4936      while (s < PL_reginput) {
4937       ST.alen++;
4938       s += UTF8SKIP(s);
4939      }
4940     }
4941     else {
4942      ST.alen = PL_reginput - locinput;
4943     }
4944     if (ST.alen == 0)
4945      ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4946    }
4947    DEBUG_EXECUTE_r(
4948     PerlIO_printf(Perl_debug_log,
4949       "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4950       (int)(REPORT_CODE_OFF+(depth*2)), "",
4951       (IV) ST.count, (IV)ST.alen)
4952    );
4953
4954    locinput = PL_reginput;
4955
4956    if (cur_eval && cur_eval->u.eval.close_paren &&
4957     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4958     goto fake_end;
4959
4960    {
4961     I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4962     if ( max == REG_INFTY || ST.count < max )
4963      goto curlym_do_A; /* try to match another A */
4964    }
4965    goto curlym_do_B; /* try to match B */
4966
4967   case CURLYM_A_fail: /* just failed to match an A */
4968    REGCP_UNWIND(ST.cp);
4969
4970    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4971     || (cur_eval && cur_eval->u.eval.close_paren &&
4972      cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4973     sayNO;
4974
4975   curlym_do_B: /* execute the B in /A{m,n}B/  */
4976    PL_reginput = locinput;
4977    if (ST.c1 == CHRTEST_UNINIT) {
4978     /* calculate c1 and c2 for possible match of 1st char
4979     * following curly */
4980     ST.c1 = ST.c2 = CHRTEST_VOID;
4981     if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4982      regnode *text_node = ST.B;
4983      if (! HAS_TEXT(text_node))
4984       FIND_NEXT_IMPT(text_node);
4985      /* this used to be
4986
4987       (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4988
4989        But the former is redundant in light of the latter.
4990
4991        if this changes back then the macro for
4992        IS_TEXT and friends need to change.
4993      */
4994      if (PL_regkind[OP(text_node)] == EXACT)
4995      {
4996
4997       ST.c1 = (U8)*STRING(text_node);
4998       switch (OP(text_node)) {
4999        case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5000        case EXACTFA:
5001        case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5002        case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5003        default: ST.c2 = ST.c1;
5004       }
5005      }
5006     }
5007    }
5008
5009    DEBUG_EXECUTE_r(
5010     PerlIO_printf(Perl_debug_log,
5011      "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5012      (int)(REPORT_CODE_OFF+(depth*2)),
5013      "", (IV)ST.count)
5014     );
5015    if (ST.c1 != CHRTEST_VOID
5016      && UCHARAT(PL_reginput) != ST.c1
5017      && UCHARAT(PL_reginput) != ST.c2)
5018    {
5019     /* simulate B failing */
5020     DEBUG_OPTIMISE_r(
5021      PerlIO_printf(Perl_debug_log,
5022       "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5023       (int)(REPORT_CODE_OFF+(depth*2)),"",
5024       (IV)ST.c1,(IV)ST.c2
5025     ));
5026     state_num = CURLYM_B_fail;
5027     goto reenter_switch;
5028    }
5029
5030    if (ST.me->flags) {
5031     /* mark current A as captured */
5032     I32 paren = ST.me->flags;
5033     if (ST.count) {
5034      PL_regoffs[paren].start
5035       = HOPc(PL_reginput, -ST.alen) - PL_bostr;
5036      PL_regoffs[paren].end = PL_reginput - PL_bostr;
5037      /*dmq: *PL_reglastcloseparen = paren; */
5038     }
5039     else
5040      PL_regoffs[paren].end = -1;
5041     if (cur_eval && cur_eval->u.eval.close_paren &&
5042      cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5043     {
5044      if (ST.count)
5045       goto fake_end;
5046      else
5047       sayNO;
5048     }
5049    }
5050
5051    PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
5052    /* NOTREACHED */
5053
5054   case CURLYM_B_fail: /* just failed to match a B */
5055    REGCP_UNWIND(ST.cp);
5056    if (ST.minmod) {
5057     I32 max = ARG2(ST.me);
5058     if (max != REG_INFTY && ST.count == max)
5059      sayNO;
5060     goto curlym_do_A; /* try to match a further A */
5061    }
5062    /* backtrack one A */
5063    if (ST.count == ARG1(ST.me) /* min */)
5064     sayNO;
5065    ST.count--;
5066    locinput = HOPc(locinput, -ST.alen);
5067    goto curlym_do_B; /* try to match B */
5068
5069 #undef ST
5070 #define ST st->u.curly
5071
5072 #define CURLY_SETPAREN(paren, success) \
5073  if (paren) { \
5074   if (success) { \
5075    PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5076    PL_regoffs[paren].end = locinput - PL_bostr; \
5077    *PL_reglastcloseparen = paren; \
5078   } \
5079   else \
5080    PL_regoffs[paren].end = -1; \
5081  }
5082
5083   case STAR:  /*  /A*B/ where A is width 1 */
5084    ST.paren = 0;
5085    ST.min = 0;
5086    ST.max = REG_INFTY;
5087    scan = NEXTOPER(scan);
5088    goto repeat;
5089   case PLUS:  /*  /A+B/ where A is width 1 */
5090    ST.paren = 0;
5091    ST.min = 1;
5092    ST.max = REG_INFTY;
5093    scan = NEXTOPER(scan);
5094    goto repeat;
5095   case CURLYN:  /*  /(A){m,n}B/ where A is width 1 */
5096    ST.paren = scan->flags; /* Which paren to set */
5097    if (ST.paren > PL_regsize)
5098     PL_regsize = ST.paren;
5099    if (ST.paren > *PL_reglastparen)
5100     *PL_reglastparen = ST.paren;
5101    ST.min = ARG1(scan);  /* min to match */
5102    ST.max = ARG2(scan);  /* max to match */
5103    if (cur_eval && cur_eval->u.eval.close_paren &&
5104     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5105     ST.min=1;
5106     ST.max=1;
5107    }
5108    scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5109    goto repeat;
5110   case CURLY:  /*  /A{m,n}B/ where A is width 1 */
5111    ST.paren = 0;
5112    ST.min = ARG1(scan);  /* min to match */
5113    ST.max = ARG2(scan);  /* max to match */
5114    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5115   repeat:
5116    /*
5117    * Lookahead to avoid useless match attempts
5118    * when we know what character comes next.
5119    *
5120    * Used to only do .*x and .*?x, but now it allows
5121    * for )'s, ('s and (?{ ... })'s to be in the way
5122    * of the quantifier and the EXACT-like node.  -- japhy
5123    */
5124
5125    if (ST.min > ST.max) /* XXX make this a compile-time check? */
5126     sayNO;
5127    if (HAS_TEXT(next) || JUMPABLE(next)) {
5128     U8 *s;
5129     regnode *text_node = next;
5130
5131     if (! HAS_TEXT(text_node))
5132      FIND_NEXT_IMPT(text_node);
5133
5134     if (! HAS_TEXT(text_node))
5135      ST.c1 = ST.c2 = CHRTEST_VOID;
5136     else {
5137      if ( PL_regkind[OP(text_node)] != EXACT ) {
5138       ST.c1 = ST.c2 = CHRTEST_VOID;
5139       goto assume_ok_easy;
5140      }
5141      else
5142       s = (U8*)STRING(text_node);
5143
5144      /*  Currently we only get here when
5145
5146       PL_rekind[OP(text_node)] == EXACT
5147
5148       if this changes back then the macro for IS_TEXT and
5149       friends need to change. */
5150      if (!UTF_PATTERN) {
5151       ST.c1 = *s;
5152       switch (OP(text_node)) {
5153        case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5154        case EXACTFA:
5155        case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5156        case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5157        default: ST.c2 = ST.c1; break;
5158       }
5159      }
5160      else { /* UTF_PATTERN */
5161       if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5162        STRLEN ulen1, ulen2;
5163        U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5164        U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5165
5166        to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5167        to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5168 #ifdef EBCDIC
5169        ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5170              ckWARN(WARN_UTF8) ?
5171              0 : UTF8_ALLOW_ANY);
5172        ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5173              ckWARN(WARN_UTF8) ?
5174              0 : UTF8_ALLOW_ANY);
5175 #else
5176        ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5177              uniflags);
5178        ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5179              uniflags);
5180 #endif
5181       }
5182       else {
5183        ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5184              uniflags);
5185       }
5186      }
5187     }
5188    }
5189    else
5190     ST.c1 = ST.c2 = CHRTEST_VOID;
5191   assume_ok_easy:
5192
5193    ST.A = scan;
5194    ST.B = next;
5195    PL_reginput = locinput;
5196    if (minmod) {
5197     minmod = 0;
5198     if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5199      sayNO;
5200     ST.count = ST.min;
5201     locinput = PL_reginput;
5202     REGCP_SET(ST.cp);
5203     if (ST.c1 == CHRTEST_VOID)
5204      goto curly_try_B_min;
5205
5206     ST.oldloc = locinput;
5207
5208     /* set ST.maxpos to the furthest point along the
5209     * string that could possibly match */
5210     if  (ST.max == REG_INFTY) {
5211      ST.maxpos = PL_regeol - 1;
5212      if (utf8_target)
5213       while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5214        ST.maxpos--;
5215     }
5216     else if (utf8_target) {
5217      int m = ST.max - ST.min;
5218      for (ST.maxpos = locinput;
5219       m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5220       ST.maxpos += UTF8SKIP(ST.maxpos);
5221     }
5222     else {
5223      ST.maxpos = locinput + ST.max - ST.min;
5224      if (ST.maxpos >= PL_regeol)
5225       ST.maxpos = PL_regeol - 1;
5226     }
5227     goto curly_try_B_min_known;
5228
5229    }
5230    else {
5231     ST.count = regrepeat(rex, ST.A, ST.max, depth);
5232     locinput = PL_reginput;
5233     if (ST.count < ST.min)
5234      sayNO;
5235     if ((ST.count > ST.min)
5236      && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5237     {
5238      /* A{m,n} must come at the end of the string, there's
5239      * no point in backing off ... */
5240      ST.min = ST.count;
5241      /* ...except that $ and \Z can match before *and* after
5242      newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
5243      We may back off by one in this case. */
5244      if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5245       ST.min--;
5246     }
5247     REGCP_SET(ST.cp);
5248     goto curly_try_B_max;
5249    }
5250    /* NOTREACHED */
5251
5252
5253   case CURLY_B_min_known_fail:
5254    /* failed to find B in a non-greedy match where c1,c2 valid */
5255    if (ST.paren && ST.count)
5256     PL_regoffs[ST.paren].end = -1;
5257
5258    PL_reginput = locinput; /* Could be reset... */
5259    REGCP_UNWIND(ST.cp);
5260    /* Couldn't or didn't -- move forward. */
5261    ST.oldloc = locinput;
5262    if (utf8_target)
5263     locinput += UTF8SKIP(locinput);
5264    else
5265     locinput++;
5266    ST.count++;
5267   curly_try_B_min_known:
5268    /* find the next place where 'B' could work, then call B */
5269    {
5270     int n;
5271     if (utf8_target) {
5272      n = (ST.oldloc == locinput) ? 0 : 1;
5273      if (ST.c1 == ST.c2) {
5274       STRLEN len;
5275       /* set n to utf8_distance(oldloc, locinput) */
5276       while (locinput <= ST.maxpos &&
5277        utf8n_to_uvchr((U8*)locinput,
5278            UTF8_MAXBYTES, &len,
5279            uniflags) != (UV)ST.c1) {
5280        locinput += len;
5281        n++;
5282       }
5283      }
5284      else {
5285       /* set n to utf8_distance(oldloc, locinput) */
5286       while (locinput <= ST.maxpos) {
5287        STRLEN len;
5288        const UV c = utf8n_to_uvchr((U8*)locinput,
5289             UTF8_MAXBYTES, &len,
5290             uniflags);
5291        if (c == (UV)ST.c1 || c == (UV)ST.c2)
5292         break;
5293        locinput += len;
5294        n++;
5295       }
5296      }
5297     }
5298     else {
5299      if (ST.c1 == ST.c2) {
5300       while (locinput <= ST.maxpos &&
5301        UCHARAT(locinput) != ST.c1)
5302        locinput++;
5303      }
5304      else {
5305       while (locinput <= ST.maxpos
5306        && UCHARAT(locinput) != ST.c1
5307        && UCHARAT(locinput) != ST.c2)
5308        locinput++;
5309      }
5310      n = locinput - ST.oldloc;
5311     }
5312     if (locinput > ST.maxpos)
5313      sayNO;
5314     /* PL_reginput == oldloc now */
5315     if (n) {
5316      ST.count += n;
5317      if (regrepeat(rex, ST.A, n, depth) < n)
5318       sayNO;
5319     }
5320     PL_reginput = locinput;
5321     CURLY_SETPAREN(ST.paren, ST.count);
5322     if (cur_eval && cur_eval->u.eval.close_paren &&
5323      cur_eval->u.eval.close_paren == (U32)ST.paren) {
5324      goto fake_end;
5325     }
5326     PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5327    }
5328    /* NOTREACHED */
5329
5330
5331   case CURLY_B_min_fail:
5332    /* failed to find B in a non-greedy match where c1,c2 invalid */
5333    if (ST.paren && ST.count)
5334     PL_regoffs[ST.paren].end = -1;
5335
5336    REGCP_UNWIND(ST.cp);
5337    /* failed -- move forward one */
5338    PL_reginput = locinput;
5339    if (regrepeat(rex, ST.A, 1, depth)) {
5340     ST.count++;
5341     locinput = PL_reginput;
5342     if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5343       ST.count > 0)) /* count overflow ? */
5344     {
5345     curly_try_B_min:
5346      CURLY_SETPAREN(ST.paren, ST.count);
5347      if (cur_eval && cur_eval->u.eval.close_paren &&
5348       cur_eval->u.eval.close_paren == (U32)ST.paren) {
5349       goto fake_end;
5350      }
5351      PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5352     }
5353    }
5354    sayNO;
5355    /* NOTREACHED */
5356
5357
5358   curly_try_B_max:
5359    /* a successful greedy match: now try to match B */
5360    if (cur_eval && cur_eval->u.eval.close_paren &&
5361     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5362     goto fake_end;
5363    }
5364    {
5365     UV c = 0;
5366     if (ST.c1 != CHRTEST_VOID)
5367      c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5368           UTF8_MAXBYTES, 0, uniflags)
5369         : (UV) UCHARAT(PL_reginput);
5370     /* If it could work, try it. */
5371     if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5372      CURLY_SETPAREN(ST.paren, ST.count);
5373      PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5374      /* NOTREACHED */
5375     }
5376    }
5377    /* FALL THROUGH */
5378   case CURLY_B_max_fail:
5379    /* failed to find B in a greedy match */
5380    if (ST.paren && ST.count)
5381     PL_regoffs[ST.paren].end = -1;
5382
5383    REGCP_UNWIND(ST.cp);
5384    /*  back up. */
5385    if (--ST.count < ST.min)
5386     sayNO;
5387    PL_reginput = locinput = HOPc(locinput, -1);
5388    goto curly_try_B_max;
5389
5390 #undef ST
5391
5392   case END:
5393    fake_end:
5394    if (cur_eval) {
5395     /* we've just finished A in /(??{A})B/; now continue with B */
5396     I32 tmpix;
5397     st->u.eval.toggle_reg_flags
5398        = cur_eval->u.eval.toggle_reg_flags;
5399     PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5400
5401     st->u.eval.prev_rex = rex_sv;  /* inner */
5402     SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5403     rex = (struct regexp *)SvANY(rex_sv);
5404     rexi = RXi_GET(rex);
5405     cur_curlyx = cur_eval->u.eval.prev_curlyx;
5406     (void)ReREFCNT_inc(rex_sv);
5407     st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
5408
5409     /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5410     PL_reglastparen = &rex->lastparen;
5411     PL_reglastcloseparen = &rex->lastcloseparen;
5412
5413     REGCP_SET(st->u.eval.lastcp);
5414     PL_reginput = locinput;
5415
5416     /* Restore parens of the outer rex without popping the
5417     * savestack */
5418     tmpix = PL_savestack_ix;
5419     PL_savestack_ix = cur_eval->u.eval.lastcp;
5420     regcppop(rex);
5421     PL_savestack_ix = tmpix;
5422
5423     st->u.eval.prev_eval = cur_eval;
5424     cur_eval = cur_eval->u.eval.prev_eval;
5425     DEBUG_EXECUTE_r(
5426      PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5427          REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5428     if ( nochange_depth )
5429      nochange_depth--;
5430
5431     PUSH_YES_STATE_GOTO(EVAL_AB,
5432       st->u.eval.prev_eval->u.eval.B); /* match B */
5433    }
5434
5435    if (locinput < reginfo->till) {
5436     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5437          "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5438          PL_colors[4],
5439          (long)(locinput - PL_reg_starttry),
5440          (long)(reginfo->till - PL_reg_starttry),
5441          PL_colors[5]));
5442
5443     sayNO_SILENT;  /* Cannot match: too short. */
5444    }
5445    PL_reginput = locinput; /* put where regtry can find it */
5446    sayYES;   /* Success! */
5447
5448   case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5449    DEBUG_EXECUTE_r(
5450    PerlIO_printf(Perl_debug_log,
5451     "%*s  %ssubpattern success...%s\n",
5452     REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5453    PL_reginput = locinput; /* put where regtry can find it */
5454    sayYES;   /* Success! */
5455
5456 #undef  ST
5457 #define ST st->u.ifmatch
5458
5459   case SUSPEND: /* (?>A) */
5460    ST.wanted = 1;
5461    PL_reginput = locinput;
5462    goto do_ifmatch;
5463
5464   case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5465    ST.wanted = 0;
5466    goto ifmatch_trivial_fail_test;
5467
5468   case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5469    ST.wanted = 1;
5470   ifmatch_trivial_fail_test:
5471    if (scan->flags) {
5472     char * const s = HOPBACKc(locinput, scan->flags);
5473     if (!s) {
5474      /* trivial fail */
5475      if (logical) {
5476       logical = 0;
5477       sw = 1 - cBOOL(ST.wanted);
5478      }
5479      else if (ST.wanted)
5480       sayNO;
5481      next = scan + ARG(scan);
5482      if (next == scan)
5483       next = NULL;
5484      break;
5485     }
5486     PL_reginput = s;
5487    }
5488    else
5489     PL_reginput = locinput;
5490
5491   do_ifmatch:
5492    ST.me = scan;
5493    ST.logical = logical;
5494    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5495
5496    /* execute body of (?...A) */
5497    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5498    /* NOTREACHED */
5499
5500   case IFMATCH_A_fail: /* body of (?...A) failed */
5501    ST.wanted = !ST.wanted;
5502    /* FALL THROUGH */
5503
5504   case IFMATCH_A: /* body of (?...A) succeeded */
5505    if (ST.logical) {
5506     sw = cBOOL(ST.wanted);
5507    }
5508    else if (!ST.wanted)
5509     sayNO;
5510
5511    if (OP(ST.me) == SUSPEND)
5512     locinput = PL_reginput;
5513    else {
5514     locinput = PL_reginput = st->locinput;
5515     nextchr = UCHARAT(locinput);
5516    }
5517    scan = ST.me + ARG(ST.me);
5518    if (scan == ST.me)
5519     scan = NULL;
5520    continue; /* execute B */
5521
5522 #undef ST
5523
5524   case LONGJMP:
5525    next = scan + ARG(scan);
5526    if (next == scan)
5527     next = NULL;
5528    break;
5529   case COMMIT:
5530    reginfo->cutpoint = PL_regeol;
5531    /* FALLTHROUGH */
5532   case PRUNE:
5533    PL_reginput = locinput;
5534    if (!scan->flags)
5535     sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5536    PUSH_STATE_GOTO(COMMIT_next,next);
5537    /* NOTREACHED */
5538   case COMMIT_next_fail:
5539    no_final = 1;
5540    /* FALLTHROUGH */
5541   case OPFAIL:
5542    sayNO;
5543    /* NOTREACHED */
5544
5545 #define ST st->u.mark
5546   case MARKPOINT:
5547    ST.prev_mark = mark_state;
5548    ST.mark_name = sv_commit = sv_yes_mark
5549     = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5550    mark_state = st;
5551    ST.mark_loc = PL_reginput = locinput;
5552    PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5553    /* NOTREACHED */
5554   case MARKPOINT_next:
5555    mark_state = ST.prev_mark;
5556    sayYES;
5557    /* NOTREACHED */
5558   case MARKPOINT_next_fail:
5559    if (popmark && sv_eq(ST.mark_name,popmark))
5560    {
5561     if (ST.mark_loc > startpoint)
5562      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5563     popmark = NULL; /* we found our mark */
5564     sv_commit = ST.mark_name;
5565
5566     DEBUG_EXECUTE_r({
5567       PerlIO_printf(Perl_debug_log,
5568        "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5569        REPORT_CODE_OFF+depth*2, "",
5570        PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5571     });
5572    }
5573    mark_state = ST.prev_mark;
5574    sv_yes_mark = mark_state ?
5575     mark_state->u.mark.mark_name : NULL;
5576    sayNO;
5577    /* NOTREACHED */
5578   case SKIP:
5579    PL_reginput = locinput;
5580    if (scan->flags) {
5581     /* (*SKIP) : if we fail we cut here*/
5582     ST.mark_name = NULL;
5583     ST.mark_loc = locinput;
5584     PUSH_STATE_GOTO(SKIP_next,next);
5585    } else {
5586     /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5587     otherwise do nothing.  Meaning we need to scan
5588     */
5589     regmatch_state *cur = mark_state;
5590     SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5591
5592     while (cur) {
5593      if ( sv_eq( cur->u.mark.mark_name,
5594         find ) )
5595      {
5596       ST.mark_name = find;
5597       PUSH_STATE_GOTO( SKIP_next, next );
5598      }
5599      cur = cur->u.mark.prev_mark;
5600     }
5601    }
5602    /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5603    break;
5604   case SKIP_next_fail:
5605    if (ST.mark_name) {
5606     /* (*CUT:NAME) - Set up to search for the name as we
5607     collapse the stack*/
5608     popmark = ST.mark_name;
5609    } else {
5610     /* (*CUT) - No name, we cut here.*/
5611     if (ST.mark_loc > startpoint)
5612      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5613     /* but we set sv_commit to latest mark_name if there
5614     is one so they can test to see how things lead to this
5615     cut */
5616     if (mark_state)
5617      sv_commit=mark_state->u.mark.mark_name;
5618    }
5619    no_final = 1;
5620    sayNO;
5621    /* NOTREACHED */
5622 #undef ST
5623   case FOLDCHAR:
5624    n = ARG(scan);
5625    if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5626     locinput += ln;
5627    } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) {
5628     sayNO;
5629    } else  {
5630     U8 folded[UTF8_MAXBYTES_CASE+1];
5631     STRLEN foldlen;
5632     const char * const l = locinput;
5633     char *e = PL_regeol;
5634     to_uni_fold(n, folded, &foldlen);
5635
5636     if (! foldEQ_utf8((const char*) folded, 0,  foldlen, 1,
5637        l, &e, 0,  utf8_target)) {
5638       sayNO;
5639     }
5640     locinput = e;
5641    }
5642    nextchr = UCHARAT(locinput);
5643    break;
5644   case LNBREAK:
5645    if ((n=is_LNBREAK(locinput,utf8_target))) {
5646     locinput += n;
5647     nextchr = UCHARAT(locinput);
5648    } else
5649     sayNO;
5650    break;
5651
5652 #define CASE_CLASS(nAmE)                              \
5653   case nAmE:                                    \
5654    if ((n=is_##nAmE(locinput,utf8_target))) {    \
5655     locinput += n;                        \
5656     nextchr = UCHARAT(locinput);          \
5657    } else                                    \
5658     sayNO;                                \
5659    break;                                    \
5660   case N##nAmE:                                 \
5661    if ((n=is_##nAmE(locinput,utf8_target))) {    \
5662     sayNO;                                \
5663    } else {                                  \
5664     locinput += UTF8SKIP(locinput);       \
5665     nextchr = UCHARAT(locinput);          \
5666    }                                         \
5667    break
5668
5669   CASE_CLASS(VERTWS);
5670   CASE_CLASS(HORIZWS);
5671 #undef CASE_CLASS
5672
5673   default:
5674    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5675       PTR2UV(scan), OP(scan));
5676    Perl_croak(aTHX_ "regexp memory corruption");
5677
5678   } /* end switch */
5679
5680   /* switch break jumps here */
5681   scan = next; /* prepare to execute the next op and ... */
5682   continue;    /* ... jump back to the top, reusing st */
5683   /* NOTREACHED */
5684
5685  push_yes_state:
5686   /* push a state that backtracks on success */
5687   st->u.yes.prev_yes_state = yes_state;
5688   yes_state = st;
5689   /* FALL THROUGH */
5690  push_state:
5691   /* push a new regex state, then continue at scan  */
5692   {
5693    regmatch_state *newst;
5694
5695    DEBUG_STACK_r({
5696     regmatch_state *cur = st;
5697     regmatch_state *curyes = yes_state;
5698     int curd = depth;
5699     regmatch_slab *slab = PL_regmatch_slab;
5700     for (;curd > -1;cur--,curd--) {
5701      if (cur < SLAB_FIRST(slab)) {
5702       slab = slab->prev;
5703       cur = SLAB_LAST(slab);
5704      }
5705      PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5706       REPORT_CODE_OFF + 2 + depth * 2,"",
5707       curd, PL_reg_name[cur->resume_state],
5708       (curyes == cur) ? "yes" : ""
5709      );
5710      if (curyes == cur)
5711       curyes = cur->u.yes.prev_yes_state;
5712     }
5713    } else
5714     DEBUG_STATE_pp("push")
5715    );
5716    depth++;
5717    st->locinput = locinput;
5718    newst = st+1;
5719    if (newst >  SLAB_LAST(PL_regmatch_slab))
5720     newst = S_push_slab(aTHX);
5721    PL_regmatch_state = newst;
5722
5723    locinput = PL_reginput;
5724    nextchr = UCHARAT(locinput);
5725    st = newst;
5726    continue;
5727    /* NOTREACHED */
5728   }
5729  }
5730
5731  /*
5732  * We get here only if there's trouble -- normally "case END" is
5733  * the terminating point.
5734  */
5735  Perl_croak(aTHX_ "corrupted regexp pointers");
5736  /*NOTREACHED*/
5737  sayNO;
5738
5739 yes:
5740  if (yes_state) {
5741   /* we have successfully completed a subexpression, but we must now
5742   * pop to the state marked by yes_state and continue from there */
5743   assert(st != yes_state);
5744 #ifdef DEBUGGING
5745   while (st != yes_state) {
5746    st--;
5747    if (st < SLAB_FIRST(PL_regmatch_slab)) {
5748     PL_regmatch_slab = PL_regmatch_slab->prev;
5749     st = SLAB_LAST(PL_regmatch_slab);
5750    }
5751    DEBUG_STATE_r({
5752     if (no_final) {
5753      DEBUG_STATE_pp("pop (no final)");
5754     } else {
5755      DEBUG_STATE_pp("pop (yes)");
5756     }
5757    });
5758    depth--;
5759   }
5760 #else
5761   while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5762    || yes_state > SLAB_LAST(PL_regmatch_slab))
5763   {
5764    /* not in this slab, pop slab */
5765    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5766    PL_regmatch_slab = PL_regmatch_slab->prev;
5767    st = SLAB_LAST(PL_regmatch_slab);
5768   }
5769   depth -= (st - yes_state);
5770 #endif
5771   st = yes_state;
5772   yes_state = st->u.yes.prev_yes_state;
5773   PL_regmatch_state = st;
5774
5775   if (no_final) {
5776    locinput= st->locinput;
5777    nextchr = UCHARAT(locinput);
5778   }
5779   state_num = st->resume_state + no_final;
5780   goto reenter_switch;
5781  }
5782
5783  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5784       PL_colors[4], PL_colors[5]));
5785
5786  if (PL_reg_eval_set) {
5787   /* each successfully executed (?{...}) block does the equivalent of
5788   *   local $^R = do {...}
5789   * When popping the save stack, all these locals would be undone;
5790   * bypass this by setting the outermost saved $^R to the latest
5791   * value */
5792   if (oreplsv != GvSV(PL_replgv))
5793    sv_setsv(oreplsv, GvSV(PL_replgv));
5794  }
5795  result = 1;
5796  goto final_exit;
5797
5798 no:
5799  DEBUG_EXECUTE_r(
5800   PerlIO_printf(Perl_debug_log,
5801    "%*s  %sfailed...%s\n",
5802    REPORT_CODE_OFF+depth*2, "",
5803    PL_colors[4], PL_colors[5])
5804   );
5805
5806 no_silent:
5807  if (no_final) {
5808   if (yes_state) {
5809    goto yes;
5810   } else {
5811    goto final_exit;
5812   }
5813  }
5814  if (depth) {
5815   /* there's a previous state to backtrack to */
5816   st--;
5817   if (st < SLAB_FIRST(PL_regmatch_slab)) {
5818    PL_regmatch_slab = PL_regmatch_slab->prev;
5819    st = SLAB_LAST(PL_regmatch_slab);
5820   }
5821   PL_regmatch_state = st;
5822   locinput= st->locinput;
5823   nextchr = UCHARAT(locinput);
5824
5825   DEBUG_STATE_pp("pop");
5826   depth--;
5827   if (yes_state == st)
5828    yes_state = st->u.yes.prev_yes_state;
5829
5830   state_num = st->resume_state + 1; /* failure = success + 1 */
5831   goto reenter_switch;
5832  }
5833  result = 0;
5834
5835   final_exit:
5836  if (rex->intflags & PREGf_VERBARG_SEEN) {
5837   SV *sv_err = get_sv("REGERROR", 1);
5838   SV *sv_mrk = get_sv("REGMARK", 1);
5839   if (result) {
5840    sv_commit = &PL_sv_no;
5841    if (!sv_yes_mark)
5842     sv_yes_mark = &PL_sv_yes;
5843   } else {
5844    if (!sv_commit)
5845     sv_commit = &PL_sv_yes;
5846    sv_yes_mark = &PL_sv_no;
5847   }
5848   sv_setsv(sv_err, sv_commit);
5849   sv_setsv(sv_mrk, sv_yes_mark);
5850  }
5851
5852  /* clean up; in particular, free all slabs above current one */
5853  LEAVE_SCOPE(oldsave);
5854
5855  return result;
5856 }
5857
5858 /*
5859  - regrepeat - repeatedly match something simple, report how many
5860  */
5861 /*
5862  * [This routine now assumes that it will only match on things of length 1.
5863  * That was true before, but now we assume scan - reginput is the count,
5864  * rather than incrementing count on every character.  [Er, except utf8.]]
5865  */
5866 STATIC I32
5867 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5868 {
5869  dVAR;
5870  register char *scan;
5871  register I32 c;
5872  register char *loceol = PL_regeol;
5873  register I32 hardcount = 0;
5874  register bool utf8_target = PL_reg_match_utf8;
5875  UV utf8_flags;
5876 #ifndef DEBUGGING
5877  PERL_UNUSED_ARG(depth);
5878 #endif
5879
5880  PERL_ARGS_ASSERT_REGREPEAT;
5881
5882  scan = PL_reginput;
5883  if (max == REG_INFTY)
5884   max = I32_MAX;
5885  else if (max < loceol - scan)
5886   loceol = scan + max;
5887  switch (OP(p)) {
5888  case REG_ANY:
5889   if (utf8_target) {
5890    loceol = PL_regeol;
5891    while (scan < loceol && hardcount < max && *scan != '\n') {
5892     scan += UTF8SKIP(scan);
5893     hardcount++;
5894    }
5895   } else {
5896    while (scan < loceol && *scan != '\n')
5897     scan++;
5898   }
5899   break;
5900  case SANY:
5901   if (utf8_target) {
5902    loceol = PL_regeol;
5903    while (scan < loceol && hardcount < max) {
5904     scan += UTF8SKIP(scan);
5905     hardcount++;
5906    }
5907   }
5908   else
5909    scan = loceol;
5910   break;
5911  case CANY:
5912   scan = loceol;
5913   break;
5914  case EXACT:
5915   /* To get here, EXACTish nodes must have *byte* length == 1.  That
5916   * means they match only characters in the string that can be expressed
5917   * as a single byte.  For non-utf8 strings, that means a simple match.
5918   * For utf8 strings, the character matched must be an invariant, or
5919   * downgradable to a single byte.  The pattern's utf8ness is
5920   * irrelevant, as since it's a single byte, it either isn't utf8, or if
5921   * it is, it's an invariant */
5922
5923   c = (U8)*STRING(p);
5924   assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5925
5926   if (! utf8_target || UNI_IS_INVARIANT(c)) {
5927    while (scan < loceol && UCHARAT(scan) == c) {
5928     scan++;
5929    }
5930   }
5931   else {
5932
5933    /* Here, the string is utf8, and the pattern char is different
5934    * in utf8 than not, so can't compare them directly.  Outside the
5935    * loop, find find the two utf8 bytes that represent c, and then
5936    * look for those in sequence in the utf8 string */
5937    U8 high = UTF8_TWO_BYTE_HI(c);
5938    U8 low = UTF8_TWO_BYTE_LO(c);
5939    loceol = PL_regeol;
5940
5941    while (hardcount < max
5942      && scan + 1 < loceol
5943      && UCHARAT(scan) == high
5944      && UCHARAT(scan + 1) == low)
5945    {
5946     scan += 2;
5947     hardcount++;
5948    }
5949   }
5950   break;
5951  case EXACTFA:
5952   utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5953   goto do_exactf;
5954
5955  case EXACTFL:
5956   PL_reg_flags |= RF_tainted;
5957   utf8_flags = FOLDEQ_UTF8_LOCALE;
5958   goto do_exactf;
5959
5960  case EXACTF:
5961  case EXACTFU:
5962   utf8_flags = 0;
5963
5964   /* The comments for the EXACT case above apply as well to these fold
5965   * ones */
5966
5967  do_exactf:
5968   c = (U8)*STRING(p);
5969   assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5970
5971   if (utf8_target) { /* Use full Unicode fold matching */
5972    char *tmpeol = loceol;
5973    while (hardcount < max
5974      && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
5975         STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
5976    {
5977     scan = tmpeol;
5978     tmpeol = loceol;
5979     hardcount++;
5980    }
5981
5982    /* XXX Note that the above handles properly the German sharp s in
5983    * the pattern matching ss in the string.  But it doesn't handle
5984    * properly cases where the string contains say 'LIGATURE ff' and
5985    * the pattern is 'f+'.  This would require, say, a new function or
5986    * revised interface to foldEQ_utf8(), in which the maximum number
5987    * of characters to match could be passed and it would return how
5988    * many actually did.  This is just one of many cases where
5989    * multi-char folds don't work properly, and so the fix is being
5990    * deferred */
5991   }
5992   else {
5993    U8 folded;
5994
5995    /* Here, the string isn't utf8 and c is a single byte; and either
5996    * the pattern isn't utf8 or c is an invariant, so its utf8ness
5997    * doesn't affect c.  Can just do simple comparisons for exact or
5998    * fold matching. */
5999    switch (OP(p)) {
6000     case EXACTF: folded = PL_fold[c]; break;
6001     case EXACTFA:
6002     case EXACTFU: folded = PL_fold_latin1[c]; break;
6003     case EXACTFL: folded = PL_fold_locale[c]; break;
6004     default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6005    }
6006    while (scan < loceol &&
6007     (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6008    {
6009     scan++;
6010    }
6011   }
6012   break;
6013  case ANYOFV:
6014  case ANYOF:
6015   if (utf8_target || OP(p) == ANYOFV) {
6016    STRLEN inclasslen;
6017    loceol = PL_regeol;
6018    inclasslen = loceol - scan;
6019    while (hardcount < max
6020     && ((inclasslen = loceol - scan) > 0)
6021     && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6022    {
6023     scan += inclasslen;
6024     hardcount++;
6025    }
6026   } else {
6027    while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6028     scan++;
6029   }
6030   break;
6031  case ALNUMU:
6032   if (utf8_target) {
6033  utf8_wordchar:
6034    loceol = PL_regeol;
6035    LOAD_UTF8_CHARCLASS_ALNUM();
6036    while (hardcount < max && scan < loceol &&
6037     swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6038    {
6039     scan += UTF8SKIP(scan);
6040     hardcount++;
6041    }
6042   } else {
6043    while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6044     scan++;
6045    }
6046   }
6047   break;
6048  case ALNUM:
6049   if (utf8_target)
6050    goto utf8_wordchar;
6051   while (scan < loceol && isALNUM((U8) *scan)) {
6052    scan++;
6053   }
6054   break;
6055  case ALNUMA:
6056   while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6057    scan++;
6058   }
6059   break;
6060  case ALNUML:
6061   PL_reg_flags |= RF_tainted;
6062   if (utf8_target) {
6063    loceol = PL_regeol;
6064    while (hardcount < max && scan < loceol &&
6065     isALNUM_LC_utf8((U8*)scan)) {
6066     scan += UTF8SKIP(scan);
6067     hardcount++;
6068    }
6069   } else {
6070    while (scan < loceol && isALNUM_LC(*scan))
6071     scan++;
6072   }
6073   break;
6074  case NALNUMU:
6075   if (utf8_target) {
6076
6077  utf8_Nwordchar:
6078
6079    loceol = PL_regeol;
6080    LOAD_UTF8_CHARCLASS_ALNUM();
6081    while (hardcount < max && scan < loceol &&
6082     ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6083    {
6084     scan += UTF8SKIP(scan);
6085     hardcount++;
6086    }
6087   } else {
6088    while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6089     scan++;
6090    }
6091   }
6092   break;
6093  case NALNUM:
6094   if (utf8_target)
6095    goto utf8_Nwordchar;
6096   while (scan < loceol && ! isALNUM((U8) *scan)) {
6097    scan++;
6098   }
6099   break;
6100  case NALNUMA:
6101   if (utf8_target) {
6102    while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6103     scan += UTF8SKIP(scan);
6104    }
6105   }
6106   else {
6107    while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6108     scan++;
6109    }
6110   }
6111   break;
6112  case NALNUML:
6113   PL_reg_flags |= RF_tainted;
6114   if (utf8_target) {
6115    loceol = PL_regeol;
6116    while (hardcount < max && scan < loceol &&
6117     !isALNUM_LC_utf8((U8*)scan)) {
6118     scan += UTF8SKIP(scan);
6119     hardcount++;
6120    }
6121   } else {
6122    while (scan < loceol && !isALNUM_LC(*scan))
6123     scan++;
6124   }
6125   break;
6126  case SPACEU:
6127   if (utf8_target) {
6128
6129  utf8_space:
6130
6131    loceol = PL_regeol;
6132    LOAD_UTF8_CHARCLASS_SPACE();
6133    while (hardcount < max && scan < loceol &&
6134     (*scan == ' ' ||
6135      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6136    {
6137     scan += UTF8SKIP(scan);
6138     hardcount++;
6139    }
6140    break;
6141   }
6142   else {
6143    while (scan < loceol && isSPACE_L1((U8) *scan)) {
6144     scan++;
6145    }
6146    break;
6147   }
6148  case SPACE:
6149   if (utf8_target)
6150    goto utf8_space;
6151
6152   while (scan < loceol && isSPACE((U8) *scan)) {
6153    scan++;
6154   }
6155   break;
6156  case SPACEA:
6157   while (scan < loceol && isSPACE_A((U8) *scan)) {
6158    scan++;
6159   }
6160   break;
6161  case SPACEL:
6162   PL_reg_flags |= RF_tainted;
6163   if (utf8_target) {
6164    loceol = PL_regeol;
6165    while (hardcount < max && scan < loceol &&
6166     isSPACE_LC_utf8((U8*)scan)) {
6167     scan += UTF8SKIP(scan);
6168     hardcount++;
6169    }
6170   } else {
6171    while (scan < loceol && isSPACE_LC(*scan))
6172     scan++;
6173   }
6174   break;
6175  case NSPACEU:
6176   if (utf8_target) {
6177
6178  utf8_Nspace:
6179
6180    loceol = PL_regeol;
6181    LOAD_UTF8_CHARCLASS_SPACE();
6182    while (hardcount < max && scan < loceol &&
6183     ! (*scan == ' ' ||
6184      swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6185    {
6186     scan += UTF8SKIP(scan);
6187     hardcount++;
6188    }
6189    break;
6190   }
6191   else {
6192    while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6193     scan++;
6194    }
6195   }
6196   break;
6197  case NSPACE:
6198   if (utf8_target)
6199    goto utf8_Nspace;
6200
6201   while (scan < loceol && ! isSPACE((U8) *scan)) {
6202    scan++;
6203   }
6204   break;
6205  case NSPACEA:
6206   if (utf8_target) {
6207    while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6208     scan += UTF8SKIP(scan);
6209    }
6210   }
6211   else {
6212    while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6213     scan++;
6214    }
6215   }
6216   break;
6217  case NSPACEL:
6218   PL_reg_flags |= RF_tainted;
6219   if (utf8_target) {
6220    loceol = PL_regeol;
6221    while (hardcount < max && scan < loceol &&
6222     !isSPACE_LC_utf8((U8*)scan)) {
6223     scan += UTF8SKIP(scan);
6224     hardcount++;
6225    }
6226   } else {
6227    while (scan < loceol && !isSPACE_LC(*scan))
6228     scan++;
6229   }
6230   break;
6231  case DIGIT:
6232   if (utf8_target) {
6233    loceol = PL_regeol;
6234    LOAD_UTF8_CHARCLASS_DIGIT();
6235    while (hardcount < max && scan < loceol &&
6236     swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6237     scan += UTF8SKIP(scan);
6238     hardcount++;
6239    }
6240   } else {
6241    while (scan < loceol && isDIGIT(*scan))
6242     scan++;
6243   }
6244   break;
6245  case DIGITA:
6246   while (scan < loceol && isDIGIT_A((U8) *scan)) {
6247    scan++;
6248   }
6249   break;
6250  case DIGITL:
6251   PL_reg_flags |= RF_tainted;
6252   if (utf8_target) {
6253    loceol = PL_regeol;
6254    while (hardcount < max && scan < loceol &&
6255     isDIGIT_LC_utf8((U8*)scan)) {
6256     scan += UTF8SKIP(scan);
6257     hardcount++;
6258    }
6259   } else {
6260    while (scan < loceol && isDIGIT_LC(*scan))
6261     scan++;
6262   }
6263   break;
6264  case NDIGIT:
6265   if (utf8_target) {
6266    loceol = PL_regeol;
6267    LOAD_UTF8_CHARCLASS_DIGIT();
6268    while (hardcount < max && scan < loceol &&
6269     !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6270     scan += UTF8SKIP(scan);
6271     hardcount++;
6272    }
6273   } else {
6274    while (scan < loceol && !isDIGIT(*scan))
6275     scan++;
6276   }
6277   break;
6278  case NDIGITA:
6279   if (utf8_target) {
6280    while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6281     scan += UTF8SKIP(scan);
6282    }
6283   }
6284   else {
6285    while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6286     scan++;
6287    }
6288   }
6289   break;
6290  case NDIGITL:
6291   PL_reg_flags |= RF_tainted;
6292   if (utf8_target) {
6293    loceol = PL_regeol;
6294    while (hardcount < max && scan < loceol &&
6295     !isDIGIT_LC_utf8((U8*)scan)) {
6296     scan += UTF8SKIP(scan);
6297     hardcount++;
6298    }
6299   } else {
6300    while (scan < loceol && !isDIGIT_LC(*scan))
6301     scan++;
6302   }
6303   break;
6304  case LNBREAK:
6305   if (utf8_target) {
6306    loceol = PL_regeol;
6307    while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6308     scan += c;
6309     hardcount++;
6310    }
6311   } else {
6312    /*
6313    LNBREAK can match two latin chars, which is ok,
6314    because we have a null terminated string, but we
6315    have to use hardcount in this situation
6316    */
6317    while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
6318     scan+=c;
6319     hardcount++;
6320    }
6321   }
6322   break;
6323  case HORIZWS:
6324   if (utf8_target) {
6325    loceol = PL_regeol;
6326    while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6327     scan += c;
6328     hardcount++;
6329    }
6330   } else {
6331    while (scan < loceol && is_HORIZWS_latin1(scan))
6332     scan++;
6333   }
6334   break;
6335  case NHORIZWS:
6336   if (utf8_target) {
6337    loceol = PL_regeol;
6338    while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6339     scan += UTF8SKIP(scan);
6340     hardcount++;
6341    }
6342   } else {
6343    while (scan < loceol && !is_HORIZWS_latin1(scan))
6344     scan++;
6345
6346   }
6347   break;
6348  case VERTWS:
6349   if (utf8_target) {
6350    loceol = PL_regeol;
6351    while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6352     scan += c;
6353     hardcount++;
6354    }
6355   } else {
6356    while (scan < loceol && is_VERTWS_latin1(scan))
6357     scan++;
6358
6359   }
6360   break;
6361  case NVERTWS:
6362   if (utf8_target) {
6363    loceol = PL_regeol;
6364    while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6365     scan += UTF8SKIP(scan);
6366     hardcount++;
6367    }
6368   } else {
6369    while (scan < loceol && !is_VERTWS_latin1(scan))
6370     scan++;
6371
6372   }
6373   break;
6374
6375  default:  /* Called on something of 0 width. */
6376   break;  /* So match right here or not at all. */
6377  }
6378
6379  if (hardcount)
6380   c = hardcount;
6381  else
6382   c = scan - PL_reginput;
6383  PL_reginput = scan;
6384
6385  DEBUG_r({
6386   GET_RE_DEBUG_FLAGS_DECL;
6387   DEBUG_EXECUTE_r({
6388    SV * const prop = sv_newmortal();
6389    regprop(prog, prop, p);
6390    PerlIO_printf(Perl_debug_log,
6391       "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
6392       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6393   });
6394  });
6395
6396  return(c);
6397 }
6398
6399
6400 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6401 /*
6402 - regclass_swash - prepare the utf8 swash
6403 */
6404
6405 SV *
6406 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6407 {
6408  dVAR;
6409  SV *sw  = NULL;
6410  SV *si  = NULL;
6411  SV *alt = NULL;
6412  RXi_GET_DECL(prog,progi);
6413  const struct reg_data * const data = prog ? progi->data : NULL;
6414
6415  PERL_ARGS_ASSERT_REGCLASS_SWASH;
6416
6417  assert(ANYOF_NONBITMAP(node));
6418
6419  if (data && data->count) {
6420   const U32 n = ARG(node);
6421
6422   if (data->what[n] == 's') {
6423    SV * const rv = MUTABLE_SV(data->data[n]);
6424    AV * const av = MUTABLE_AV(SvRV(rv));
6425    SV **const ary = AvARRAY(av);
6426    SV **a, **b;
6427
6428    /* See the end of regcomp.c:S_regclass() for
6429    * documentation of these array elements. */
6430
6431    si = *ary;
6432    a  = SvROK(ary[1]) ? &ary[1] : NULL;
6433    b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6434
6435    if (a)
6436     sw = *a;
6437    else if (si && doinit) {
6438     sw = swash_init("utf8", "", si, 1, 0);
6439     (void)av_store(av, 1, sw);
6440    }
6441    if (b)
6442     alt = *b;
6443   }
6444  }
6445
6446  if (listsvp)
6447   *listsvp = si;
6448  if (altsvp)
6449   *altsvp  = alt;
6450
6451  return sw;
6452 }
6453 #endif
6454
6455 /*
6456  - reginclass - determine if a character falls into a character class
6457
6458   n is the ANYOF regnode
6459   p is the target string
6460   lenp is pointer to the maximum number of bytes of how far to go in p
6461  (This is assumed wthout checking to always be at least the current
6462  character's size)
6463   utf8_target tells whether p is in UTF-8.
6464
6465   Returns true if matched; false otherwise.  If lenp is not NULL, on return
6466   from a successful match, the value it points to will be updated to how many
6467   bytes in p were matched.  If there was no match, the value is undefined,
6468   possibly changed from the input.
6469
6470   Note that this can be a synthetic start class, a combination of various
6471   nodes, so things you think might be mutually exclusive, such as locale,
6472   aren't.  It can match both locale and non-locale
6473
6474  */
6475
6476 STATIC bool
6477 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6478 {
6479  dVAR;
6480  const char flags = ANYOF_FLAGS(n);
6481  bool match = FALSE;
6482  UV c = *p;
6483  STRLEN c_len = 0;
6484  STRLEN maxlen;
6485
6486  PERL_ARGS_ASSERT_REGINCLASS;
6487
6488  /* If c is not already the code point, get it */
6489  if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6490   c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6491     (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6492     | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6493     /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6494     * UTF8_ALLOW_FFFF */
6495   if (c_len == (STRLEN)-1)
6496    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6497  }
6498  else {
6499   c_len = 1;
6500  }
6501
6502  /* Use passed in max length, or one character if none passed in or less
6503  * than one character.  And assume will match just one character.  This is
6504  * overwritten later if matched more. */
6505  if (lenp) {
6506   maxlen = (*lenp > c_len) ? *lenp : c_len;
6507   *lenp = c_len;
6508
6509  }
6510  else {
6511   maxlen = c_len;
6512  }
6513
6514  /* If this character is potentially in the bitmap, check it */
6515  if (c < 256) {
6516   if (ANYOF_BITMAP_TEST(n, c))
6517    match = TRUE;
6518   else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6519     && ! utf8_target
6520     && ! isASCII(c))
6521   {
6522    match = TRUE;
6523   }
6524
6525   else if (flags & ANYOF_LOCALE) {
6526    PL_reg_flags |= RF_tainted;
6527
6528    if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6529     && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6530    {
6531     match = TRUE;
6532    }
6533    else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6534      ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6535      (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6536      (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6537      (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6538      (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6539      (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6540      (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6541      (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6542      (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6543      (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6544      (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6545      (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6546      (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6547      (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6548      (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6549      (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6550      (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6551      (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6552      (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6553      (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6554      (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6555      (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6556      (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6557      (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6558      (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6559      (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6560      (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6561      (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6562      (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6563      (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6564      ) /* How's that for a conditional? */
6565    ) {
6566     match = TRUE;
6567    }
6568   }
6569  }
6570
6571  /* If the bitmap didn't (or couldn't) match, and something outside the
6572  * bitmap could match, try that.  Locale nodes specifiy completely the
6573  * behavior of code points in the bit map (otherwise, a utf8 target would
6574  * cause them to be treated as Unicode and not locale), except in
6575  * the very unlikely event when this node is a synthetic start class, which
6576  * could be a combination of locale and non-locale nodes.  So allow locale
6577  * to match for the synthetic start class, which will give a false
6578  * positive that will be resolved when the match is done again as not part
6579  * of the synthetic start class */
6580  if (!match) {
6581   if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6582    match = TRUE; /* Everything above 255 matches */
6583   }
6584   else if (ANYOF_NONBITMAP(n)
6585     && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6586      || (utf8_target
6587       && (c >=256
6588        || (! (flags & ANYOF_LOCALE))
6589        || (flags & ANYOF_IS_SYNTHETIC)))))
6590   {
6591    AV *av;
6592    SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6593
6594    if (sw) {
6595     U8 * utf8_p;
6596     if (utf8_target) {
6597      utf8_p = (U8 *) p;
6598     } else {
6599
6600      /* Not utf8.  Convert as much of the string as available up
6601      * to the limit of how far the (single) character in the
6602      * pattern can possibly match (no need to go further).  If
6603      * the node is a straight ANYOF or not folding, it can't
6604      * match more than one.  Otherwise, It can match up to how
6605      * far a single char can fold to.  Since not utf8, each
6606      * character is a single byte, so the max it can be in
6607      * bytes is the same as the max it can be in characters */
6608      STRLEN len = (OP(n) == ANYOF
6609         || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
6610         ? 1
6611         : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
6612          ? maxlen
6613          : UTF8_MAX_FOLD_CHAR_EXPAND;
6614      utf8_p = bytes_to_utf8(p, &len);
6615     }
6616
6617     if (swash_fetch(sw, utf8_p, TRUE))
6618      match = TRUE;
6619     else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
6620
6621      /* Here, we need to test if the fold of the target string
6622      * matches.  The non-multi char folds have all been moved to
6623      * the compilation phase, and the multi-char folds have
6624      * been stored by regcomp into 'av'; we linearly check to
6625      * see if any match the target string (folded).   We know
6626      * that the originals were each one character, but we don't
6627      * currently know how many characters/bytes each folded to,
6628      * except we do know that there are small limits imposed by
6629      * Unicode.  XXX A performance enhancement would be to have
6630      * regcomp.c store the max number of chars/bytes that are
6631      * in an av entry, as, say the 0th element.  Even better
6632      * would be to have a hash of the few characters that can
6633      * start a multi-char fold to the max number of chars of
6634      * those folds.
6635      *
6636      * If there is a match, we will need to advance (if lenp is
6637      * specified) the match pointer in the target string.  But
6638      * what we are comparing here isn't that string directly,
6639      * but its fold, whose length may differ from the original.
6640      * As we go along in constructing the fold, therefore, we
6641      * create a map so that we know how many bytes in the
6642      * source to advance given that we have matched a certain
6643      * number of bytes in the fold.  This map is stored in
6644      * 'map_fold_len_back'.  Let n mean the number of bytes in
6645      * the fold of the first character that we are folding.
6646      * Then map_fold_len_back[n] is set to the number of bytes
6647      * in that first character.  Similarly let m be the
6648      * corresponding number for the second character to be
6649      * folded.  Then map_fold_len_back[n+m] is set to the
6650      * number of bytes occupied by the first two source
6651      * characters. ... */
6652      U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
6653      U8 folded[UTF8_MAXBYTES_CASE+1];
6654      STRLEN foldlen = 0; /* num bytes in fold of 1st char */
6655      STRLEN total_foldlen = 0; /* num bytes in fold of all
6656             chars */
6657
6658      if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
6659
6660       /* Here, only need to fold the first char of the target
6661       * string.  It the source wasn't utf8, is 1 byte long */
6662       to_utf8_fold(utf8_p, folded, &foldlen);
6663       total_foldlen = foldlen;
6664       map_fold_len_back[foldlen] = (utf8_target)
6665              ? UTF8SKIP(utf8_p)
6666              : 1;
6667      }
6668      else {
6669
6670       /* Here, need to fold more than the first char.  Do so
6671       * up to the limits */
6672       U8* source_ptr = utf8_p;    /* The source for the fold
6673              is the regex target
6674              string */
6675       U8* folded_ptr = folded;
6676       U8* e = utf8_p + maxlen;    /* Can't go beyond last
6677              available byte in the
6678              target string */
6679       U8 i;
6680       for (i = 0;
6681        i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
6682        i++)
6683       {
6684
6685        /* Fold the next character */
6686        U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
6687        STRLEN this_char_foldlen;
6688        to_utf8_fold(source_ptr,
6689           this_char_folded,
6690           &this_char_foldlen);
6691
6692        /* Bail if it would exceed the byte limit for
6693        * folding a single char. */
6694        if (this_char_foldlen + folded_ptr - folded >
6695                UTF8_MAXBYTES_CASE)
6696        {
6697         break;
6698        }
6699
6700        /* Add the fold of this character */
6701        Copy(this_char_folded,
6702         folded_ptr,
6703         this_char_foldlen,
6704         U8);
6705        source_ptr += UTF8SKIP(source_ptr);
6706        folded_ptr += this_char_foldlen;
6707        total_foldlen = folded_ptr - folded;
6708
6709        /* Create map from the number of bytes in the fold
6710        * back to the number of bytes in the source.  If
6711        * the source isn't utf8, the byte count is just
6712        * the number of characters so far */
6713        map_fold_len_back[total_foldlen]
6714              = (utf8_target)
6715               ? source_ptr - utf8_p
6716               : i + 1;
6717       }
6718       *folded_ptr = '\0';
6719      }
6720
6721
6722      /* Do the linear search to see if the fold is in the list
6723      * of multi-char folds. */
6724      if (av) {
6725       I32 i;
6726       for (i = 0; i <= av_len(av); i++) {
6727        SV* const sv = *av_fetch(av, i, FALSE);
6728        STRLEN len;
6729        const char * const s = SvPV_const(sv, len);
6730
6731        if (len <= total_foldlen && memEQ(s,
6732                (char*)folded,
6733                len))
6734        {
6735
6736         /* Advance the target string ptr to account for
6737         * this fold, but have to translate from the
6738         * folded length to the corresponding source
6739         * length. */
6740         if (lenp) {
6741          *lenp = map_fold_len_back[len];
6742          assert(*lenp != 0); /* Otherwise will loop */
6743         }
6744         match = TRUE;
6745         break;
6746        }
6747       }
6748      }
6749     }
6750
6751     /* If we allocated a string above, free it */
6752     if (! utf8_target) Safefree(utf8_p);
6753    }
6754   }
6755  }
6756
6757  return (flags & ANYOF_INVERT) ? !match : match;
6758 }
6759
6760 STATIC U8 *
6761 S_reghop3(U8 *s, I32 off, const U8* lim)
6762 {
6763  dVAR;
6764
6765  PERL_ARGS_ASSERT_REGHOP3;
6766
6767  if (off >= 0) {
6768   while (off-- && s < lim) {
6769    /* XXX could check well-formedness here */
6770    s += UTF8SKIP(s);
6771   }
6772  }
6773  else {
6774   while (off++ && s > lim) {
6775    s--;
6776    if (UTF8_IS_CONTINUED(*s)) {
6777     while (s > lim && UTF8_IS_CONTINUATION(*s))
6778      s--;
6779    }
6780    /* XXX could check well-formedness here */
6781   }
6782  }
6783  return s;
6784 }
6785
6786 #ifdef XXX_dmq
6787 /* there are a bunch of places where we use two reghop3's that should
6788    be replaced with this routine. but since thats not done yet
6789    we ifdef it out - dmq
6790 */
6791 STATIC U8 *
6792 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6793 {
6794  dVAR;
6795
6796  PERL_ARGS_ASSERT_REGHOP4;
6797
6798  if (off >= 0) {
6799   while (off-- && s < rlim) {
6800    /* XXX could check well-formedness here */
6801    s += UTF8SKIP(s);
6802   }
6803  }
6804  else {
6805   while (off++ && s > llim) {
6806    s--;
6807    if (UTF8_IS_CONTINUED(*s)) {
6808     while (s > llim && UTF8_IS_CONTINUATION(*s))
6809      s--;
6810    }
6811    /* XXX could check well-formedness here */
6812   }
6813  }
6814  return s;
6815 }
6816 #endif
6817
6818 STATIC U8 *
6819 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6820 {
6821  dVAR;
6822
6823  PERL_ARGS_ASSERT_REGHOPMAYBE3;
6824
6825  if (off >= 0) {
6826   while (off-- && s < lim) {
6827    /* XXX could check well-formedness here */
6828    s += UTF8SKIP(s);
6829   }
6830   if (off >= 0)
6831    return NULL;
6832  }
6833  else {
6834   while (off++ && s > lim) {
6835    s--;
6836    if (UTF8_IS_CONTINUED(*s)) {
6837     while (s > lim && UTF8_IS_CONTINUATION(*s))
6838      s--;
6839    }
6840    /* XXX could check well-formedness here */
6841   }
6842   if (off <= 0)
6843    return NULL;
6844  }
6845  return s;
6846 }
6847
6848 static void
6849 restore_pos(pTHX_ void *arg)
6850 {
6851  dVAR;
6852  regexp * const rex = (regexp *)arg;
6853  if (PL_reg_eval_set) {
6854   if (PL_reg_oldsaved) {
6855    rex->subbeg = PL_reg_oldsaved;
6856    rex->sublen = PL_reg_oldsavedlen;
6857 #ifdef PERL_OLD_COPY_ON_WRITE
6858    rex->saved_copy = PL_nrs;
6859 #endif
6860    RXp_MATCH_COPIED_on(rex);
6861   }
6862   PL_reg_magic->mg_len = PL_reg_oldpos;
6863   PL_reg_eval_set = 0;
6864   PL_curpm = PL_reg_oldcurpm;
6865  }
6866 }
6867
6868 STATIC void
6869 S_to_utf8_substr(pTHX_ register regexp *prog)
6870 {
6871  int i = 1;
6872
6873  PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6874
6875  do {
6876   if (prog->substrs->data[i].substr
6877    && !prog->substrs->data[i].utf8_substr) {
6878    SV* const sv = newSVsv(prog->substrs->data[i].substr);
6879    prog->substrs->data[i].utf8_substr = sv;
6880    sv_utf8_upgrade(sv);
6881    if (SvVALID(prog->substrs->data[i].substr)) {
6882     const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6883     if (flags & FBMcf_TAIL) {
6884      /* Trim the trailing \n that fbm_compile added last
6885      time.  */
6886      SvCUR_set(sv, SvCUR(sv) - 1);
6887      /* Whilst this makes the SV technically "invalid" (as its
6888      buffer is no longer followed by "\0") when fbm_compile()
6889      adds the "\n" back, a "\0" is restored.  */
6890     }
6891     fbm_compile(sv, flags);
6892    }
6893    if (prog->substrs->data[i].substr == prog->check_substr)
6894     prog->check_utf8 = sv;
6895   }
6896  } while (i--);
6897 }
6898
6899 STATIC void
6900 S_to_byte_substr(pTHX_ register regexp *prog)
6901 {
6902  dVAR;
6903  int i = 1;
6904
6905  PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6906
6907  do {
6908   if (prog->substrs->data[i].utf8_substr
6909    && !prog->substrs->data[i].substr) {
6910    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6911    if (sv_utf8_downgrade(sv, TRUE)) {
6912     if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6913      const U8 flags
6914       = BmFLAGS(prog->substrs->data[i].utf8_substr);
6915      if (flags & FBMcf_TAIL) {
6916       /* Trim the trailing \n that fbm_compile added last
6917       time.  */
6918       SvCUR_set(sv, SvCUR(sv) - 1);
6919      }
6920      fbm_compile(sv, flags);
6921     }
6922    } else {
6923     SvREFCNT_dec(sv);
6924     sv = &PL_sv_undef;
6925    }
6926    prog->substrs->data[i].substr = sv;
6927    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6928     prog->check_substr = sv;
6929   }
6930  } while (i--);
6931 }
6932
6933 /*
6934  * Local variables:
6935  * c-indentation-style: bsd
6936  * c-basic-offset: 4
6937  * indent-tabs-mode: t
6938  * End:
6939  *
6940  * ex: set ts=8 sts=4 sw=4 noet:
6941  */