src/5019002/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *  One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43     = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47  goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  * Copyright (c) 1986 by University of Toronto.
  54  * Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  * Permission is granted to anyone to use this software for any
  57  * purpose on any computer system, and to redistribute it freely,
  58  * subject to the following restrictions:
  59  *
  60  * 1. The author is not responsible for the consequences of use of
  61  *  this software, no matter how awful, even if they arise
  62  *  from defects in it.
  63  *
  64  * 2. The origin of this software must not be misrepresented, either
  65  *  by explicit claim or by omission.
  66  *
  67  * 3. Altered versions must be plainly marked as such, and must not
  68  *  be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86 #include "re_defs.h"
  87
  88 #ifdef PERL_IN_XSUB_RE
  89 #  include "re_comp.h"
  90 #else
  91 #  include "regcomp.h"
  92 #endif
  93
  94 #include "inline_invlist.c"
  95 #include "unicode_constants.h"
  96
  97 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98
  99 #ifndef STATIC
 100 #define STATIC static
 101 #endif
 102
 103 /* Valid for non-utf8 strings: avoids the reginclass
 104  * call if there are no complications: i.e., if everything matchable is
 105  * straight forward in the bitmap */
 106 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 107            : ANYOF_BITMAP_TEST(p,*(c)))
 108
 109 /*
 110  * Forwards.
 111  */
 112
 113 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 114 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 115
 116 #define HOPc(pos,off) \
 117   (char *)(reginfo->is_utf8_target \
 118    ? reghop3((U8*)pos, off, \
 119      (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 120    : (U8*)(pos + off))
 121 #define HOPBACKc(pos, off) \
 122   (char*)(reginfo->is_utf8_target \
 123    ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 124    : (pos - off >= reginfo->strbeg) \
 125     ? (U8*)pos - off  \
 126     : NULL)
 127
 128 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 129 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 130
 131
 132 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 133 #define NEXTCHR_IS_EOS (nextchr < 0)
 134
 135 #define SET_nextchr \
 136  nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 137
 138 #define SET_locinput(p) \
 139  locinput = (p);  \
 140  SET_nextchr
 141
 142
 143 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START {            \
 144   if (!swash_ptr) {                                                     \
 145    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 146    swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 147           1, 0, NULL, &flags);                 \
 148    assert(swash_ptr);                                                \
 149   }                                                                     \
 150  } STMT_END
 151
 152 /* If in debug mode, we test that a known character properly matches */
 153 #ifdef DEBUGGING
 154 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 155           property_name,                      \
 156           utf8_char_in_property)              \
 157   LOAD_UTF8_CHARCLASS(swash_ptr, property_name);                        \
 158   assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 159 #else
 160 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 161           property_name,                      \
 162           utf8_char_in_property)              \
 163   LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
 164 #endif
 165
 166 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 167           PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 168           swash_property_names[_CC_WORDCHAR],   \
 169           GREEK_SMALL_LETTER_IOTA_UTF8)
 170
 171 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 172  STMT_START {                                                              \
 173   LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 174          "_X_regular_begin",                    \
 175          GREEK_SMALL_LETTER_IOTA_UTF8);         \
 176   LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 177          "_X_extend",                           \
 178          COMBINING_GRAVE_ACCENT_UTF8);          \
 179  } STMT_END
 180
 181 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
 182 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 183
 184 /* for use after a quantifier and before an EXACT-like node -- japhy */
 185 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 186  *
 187  * NOTE that *nothing* that affects backtracking should be in here, specifically
 188  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 189  * node that is in between two EXACT like nodes when ascertaining what the required
 190  * "follow" character is. This should probably be moved to regex compile time
 191  * although it may be done at run time beause of the REF possibility - more
 192  * investigation required. -- demerphq
 193 */
 194 #define JUMPABLE(rn) (      \
 195  OP(rn) == OPEN ||       \
 196  (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 197  OP(rn) == EVAL ||   \
 198  OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 199  OP(rn) == PLUS || OP(rn) == MINMOD || \
 200  OP(rn) == KEEPS || \
 201  (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 202 )
 203 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 204
 205 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 206
 207 #if 0
 208 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 209    we don't need this definition. */
 210 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 211 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 212 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 213
 214 #else
 215 /* ... so we use this as its faster. */
 216 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 217 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 218 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 219 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 220
 221 #endif
 222
 223 /*
 224   Search for mandatory following text node; for lookahead, the text must
 225   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 226 */
 227 #define FIND_NEXT_IMPT(rn) STMT_START { \
 228  while (JUMPABLE(rn)) { \
 229   const OPCODE type = OP(rn); \
 230   if (type == SUSPEND || PL_regkind[type] == CURLY) \
 231    rn = NEXTOPER(NEXTOPER(rn)); \
 232   else if (type == PLUS) \
 233    rn = NEXTOPER(rn); \
 234   else if (type == IFMATCH) \
 235    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 236   else rn += NEXT_OFF(rn); \
 237  } \
 238 } STMT_END
 239
 240 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 241  * These are for the pre-composed Hangul syllables, which are all in a
 242  * contiguous block and arranged there in such a way so as to facilitate
 243  * alorithmic determination of their characteristics.  As such, they don't need
 244  * a swash, but can be determined by simple arithmetic.  Almost all are
 245  * GCB=LVT, but every 28th one is a GCB=LV */
 246 #define SBASE 0xAC00    /* Start of block */
 247 #define SCount 11172    /* Length of block */
 248 #define TCount 28
 249
 250 #define SLAB_FIRST(s) (&(s)->states[0])
 251 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 252
 253 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 254 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 255 static regmatch_state * S_push_slab(pTHX);
 256
 257 #define REGCP_PAREN_ELEMS 3
 258 #define REGCP_OTHER_ELEMS 3
 259 #define REGCP_FRAME_ELEMS 1
 260 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 261  * are needed for the regexp context stack bookkeeping. */
 262
 263 STATIC CHECKPOINT
 264 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 265 {
 266  dVAR;
 267  const int retval = PL_savestack_ix;
 268  const int paren_elems_to_push =
 269     (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 270  const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 271  const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 272  I32 p;
 273  GET_RE_DEBUG_FLAGS_DECL;
 274
 275  PERL_ARGS_ASSERT_REGCPPUSH;
 276
 277  if (paren_elems_to_push < 0)
 278   Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 279     paren_elems_to_push);
 280
 281  if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 282   Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 283     " out of range (%lu-%ld)",
 284     total_elems,
 285     (unsigned long)maxopenparen,
 286     (long)parenfloor);
 287
 288  SSGROW(total_elems + REGCP_FRAME_ELEMS);
 289
 290  DEBUG_BUFFERS_r(
 291   if ((int)maxopenparen > (int)parenfloor)
 292    PerlIO_printf(Perl_debug_log,
 293     "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 294     PTR2UV(rex),
 295     PTR2UV(rex->offs)
 296    );
 297  );
 298  for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 299 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 300   SSPUSHINT(rex->offs[p].end);
 301   SSPUSHINT(rex->offs[p].start);
 302   SSPUSHINT(rex->offs[p].start_tmp);
 303   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 304    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 305    (UV)p,
 306    (IV)rex->offs[p].start,
 307    (IV)rex->offs[p].start_tmp,
 308    (IV)rex->offs[p].end
 309   ));
 310  }
 311 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 312  SSPUSHINT(maxopenparen);
 313  SSPUSHINT(rex->lastparen);
 314  SSPUSHINT(rex->lastcloseparen);
 315  SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 316
 317  return retval;
 318 }
 319
 320 /* These are needed since we do not localize EVAL nodes: */
 321 #define REGCP_SET(cp)                                           \
 322  DEBUG_STATE_r(                                              \
 323    PerlIO_printf(Perl_debug_log,          \
 324     "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 325     (IV)PL_savestack_ix));                          \
 326  cp = PL_savestack_ix
 327
 328 #define REGCP_UNWIND(cp)                                        \
 329  DEBUG_STATE_r(                                              \
 330   if (cp != PL_savestack_ix)                   \
 331     PerlIO_printf(Perl_debug_log,          \
 332     "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 333     (IV)(cp), (IV)PL_savestack_ix));                \
 334  regcpblow(cp)
 335
 336 #define UNWIND_PAREN(lp, lcp)               \
 337  for (n = rex->lastparen; n > lp; n--)   \
 338   rex->offs[n].end = -1;              \
 339  rex->lastparen = n;                     \
 340  rex->lastcloseparen = lcp;
 341
 342
 343 STATIC void
 344 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 345 {
 346  dVAR;
 347  UV i;
 348  U32 paren;
 349  GET_RE_DEBUG_FLAGS_DECL;
 350
 351  PERL_ARGS_ASSERT_REGCPPOP;
 352
 353  /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 354  i = SSPOPUV;
 355  assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 356  i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 357  rex->lastcloseparen = SSPOPINT;
 358  rex->lastparen = SSPOPINT;
 359  *maxopenparen_p = SSPOPINT;
 360
 361  i -= REGCP_OTHER_ELEMS;
 362  /* Now restore the parentheses context. */
 363  DEBUG_BUFFERS_r(
 364   if (i || rex->lastparen + 1 <= rex->nparens)
 365    PerlIO_printf(Perl_debug_log,
 366     "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 367     PTR2UV(rex),
 368     PTR2UV(rex->offs)
 369    );
 370  );
 371  paren = *maxopenparen_p;
 372  for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 373   I32 tmps;
 374   rex->offs[paren].start_tmp = SSPOPINT;
 375   rex->offs[paren].start = SSPOPINT;
 376   tmps = SSPOPINT;
 377   if (paren <= rex->lastparen)
 378    rex->offs[paren].end = tmps;
 379   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 380    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 381    (UV)paren,
 382    (IV)rex->offs[paren].start,
 383    (IV)rex->offs[paren].start_tmp,
 384    (IV)rex->offs[paren].end,
 385    (paren > rex->lastparen ? "(skipped)" : ""));
 386   );
 387   paren--;
 388  }
 389 #if 1
 390  /* It would seem that the similar code in regtry()
 391  * already takes care of this, and in fact it is in
 392  * a better location to since this code can #if 0-ed out
 393  * but the code in regtry() is needed or otherwise tests
 394  * requiring null fields (pat.t#187 and split.t#{13,14}
 395  * (as of patchlevel 7877)  will fail.  Then again,
 396  * this code seems to be necessary or otherwise
 397  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 398  * --jhi updated by dapm */
 399  for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 400   if (i > *maxopenparen_p)
 401    rex->offs[i].start = -1;
 402   rex->offs[i].end = -1;
 403   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 404    "    \\%"UVuf": %s   ..-1 undeffing\n",
 405    (UV)i,
 406    (i > *maxopenparen_p) ? "-1" : "  "
 407   ));
 408  }
 409 #endif
 410 }
 411
 412 /* restore the parens and associated vars at savestack position ix,
 413  * but without popping the stack */
 414
 415 STATIC void
 416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 417 {
 418  I32 tmpix = PL_savestack_ix;
 419  PL_savestack_ix = ix;
 420  regcppop(rex, maxopenparen_p);
 421  PL_savestack_ix = tmpix;
 422 }
 423
 424 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
 425
 426 STATIC bool
 427 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 428 {
 429  /* Returns a boolean as to whether or not 'character' is a member of the
 430  * Posix character class given by 'classnum' that should be equivalent to a
 431  * value in the typedef '_char_class_number'.
 432  *
 433  * Ideally this could be replaced by a just an array of function pointers
 434  * to the C library functions that implement the macros this calls.
 435  * However, to compile, the precise function signatures are required, and
 436  * these may vary from platform to to platform.  To avoid having to figure
 437  * out what those all are on each platform, I (khw) am using this method,
 438  * which adds an extra layer of function call overhead (unless the C
 439  * optimizer strips it away).  But we don't particularly care about
 440  * performance with locales anyway. */
 441
 442  switch ((_char_class_number) classnum) {
 443   case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 444   case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 445   case _CC_ENUM_ASCII:     return isASCII_LC(character);
 446   case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 447   case _CC_ENUM_CASED:     return isLOWER_LC(character)
 448           || isUPPER_LC(character);
 449   case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 450   case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 451   case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 452   case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 453   case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 454   case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 455   case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 456   case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 457   case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 458   case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 459   case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 460   default:    /* VERTSPACE should never occur in locales */
 461    Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 462  }
 463
 464  assert(0); /* NOTREACHED */
 465  return FALSE;
 466 }
 467
 468 STATIC bool
 469 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 470 {
 471  /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 472  * 'character' is a member of the Posix character class given by 'classnum'
 473  * that should be equivalent to a value in the typedef
 474  * '_char_class_number'.
 475  *
 476  * This just calls isFOO_lc on the code point for the character if it is in
 477  * the range 0-255.  Outside that range, all characters avoid Unicode
 478  * rules, ignoring any locale.  So use the Unicode function if this class
 479  * requires a swash, and use the Unicode macro otherwise. */
 480
 481  PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 482
 483  if (UTF8_IS_INVARIANT(*character)) {
 484   return isFOO_lc(classnum, *character);
 485  }
 486  else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 487   return isFOO_lc(classnum,
 488       TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
 489  }
 490
 491  if (classnum < _FIRST_NON_SWASH_CC) {
 492
 493   /* Initialize the swash unless done already */
 494   if (! PL_utf8_swash_ptrs[classnum]) {
 495    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 496    PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
 497     swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
 498   }
 499
 500   return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 501         character,
 502         TRUE /* is UTF */ ));
 503  }
 504
 505  switch ((_char_class_number) classnum) {
 506   case _CC_ENUM_SPACE:
 507   case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 508
 509   case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 510   case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 511   case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 512   default:                 return 0;  /* Things like CNTRL are always
 513            below 256 */
 514  }
 515
 516  assert(0); /* NOTREACHED */
 517  return FALSE;
 518 }
 519
 520 /*
 521  * pregexec and friends
 522  */
 523
 524 #ifndef PERL_IN_XSUB_RE
 525 /*
 526  - pregexec - match a regexp against a string
 527  */
 528 I32
 529 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 530   char *strbeg, I32 minend, SV *screamer, U32 nosave)
 531 /* stringarg: the point in the string at which to begin matching */
 532 /* strend:    pointer to null at end of string */
 533 /* strbeg:    real beginning of string */
 534 /* minend:    end of match must be >= minend bytes after stringarg. */
 535 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 536  *            itself is accessed via the pointers above */
 537 /* nosave:    For optimizations. */
 538 {
 539  PERL_ARGS_ASSERT_PREGEXEC;
 540
 541  return
 542   regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 543      nosave ? 0 : REXEC_COPY_STR);
 544 }
 545 #endif
 546
 547 /*
 548  * Need to implement the following flags for reg_anch:
 549  *
 550  * USE_INTUIT_NOML  - Useful to call re_intuit_start() first
 551  * USE_INTUIT_ML
 552  * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
 553  * INTUIT_AUTORITATIVE_ML
 554  * INTUIT_ONCE_NOML  - Intuit can match in one location only.
 555  * INTUIT_ONCE_ML
 556  *
 557  * Another flag for this function: SECOND_TIME (so that float substrs
 558  * with giant delta may be not rechecked).
 559  */
 560
 561 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 562
 563 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 564    Otherwise, only SvCUR(sv) is used to get strbeg. */
 565
 566 /* XXXX We assume that strpos is strbeg unless sv. */
 567
 568 /* XXXX Some places assume that there is a fixed substring.
 569   An update may be needed if optimizer marks as "INTUITable"
 570   RExen without fixed substrings.  Similarly, it is assumed that
 571   lengths of all the strings are no more than minlen, thus they
 572   cannot come from lookahead.
 573   (Or minlen should take into account lookahead.)
 574   NOTE: Some of this comment is not correct. minlen does now take account
 575   of lookahead/behind. Further research is required. -- demerphq
 576
 577 */
 578
 579 /* A failure to find a constant substring means that there is no need to make
 580    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 581    finding a substring too deep into the string means that fewer calls to
 582    regtry() should be needed.
 583
 584    REx compiler's optimizer found 4 possible hints:
 585   a) Anchored substring;
 586   b) Fixed substring;
 587   c) Whether we are anchored (beginning-of-line or \G);
 588   d) First node (of those at offset 0) which may distinguish positions;
 589    We use a)b)d) and multiline-part of c), and try to find a position in the
 590    string which does not contradict any of them.
 591  */
 592
 593 /* Most of decisions we do here should have been done at compile time.
 594    The nodes of the REx which we used for the search should have been
 595    deleted from the finite automaton. */
 596
 597 /* args:
 598  * rx:     the regex to match against
 599  * sv:     the SV being matched: only used for utf8 flag; the string
 600  *         itself is accessed via the pointers below. Note that on
 601  *         something like an overloaded SV, SvPOK(sv) may be false
 602  *         and the string pointers may point to something unrelated to
 603  *         the SV itself.
 604  * strbeg: real beginning of string
 605  * strpos: the point in the string at which to begin matching
 606  * strend: pointer to the byte following the last char of the string
 607  * flags   currently unused; set to 0
 608  * data:   currently unused; set to NULL
 609  */
 610
 611 char *
 612 Perl_re_intuit_start(pTHX_
 613      REGEXP * const rx,
 614      SV *sv,
 615      const char * const strbeg,
 616      char *strpos,
 617      char *strend,
 618      const U32 flags,
 619      re_scream_pos_data *data)
 620 {
 621  dVAR;
 622  struct regexp *const prog = ReANY(rx);
 623  I32 start_shift = 0;
 624  /* Should be nonnegative! */
 625  I32 end_shift   = 0;
 626  char *s;
 627  SV *check;
 628  char *t;
 629  const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 630  I32 ml_anch;
 631  char *other_last = NULL; /* other substr checked before this */
 632  char *check_at = NULL;  /* check substr found at this pos */
 633  char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 634  const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 635  RXi_GET_DECL(prog,progi);
 636  regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 637  regmatch_info *const reginfo = &reginfo_buf;
 638 #ifdef DEBUGGING
 639  const char * const i_strpos = strpos;
 640 #endif
 641  GET_RE_DEBUG_FLAGS_DECL;
 642
 643  PERL_ARGS_ASSERT_RE_INTUIT_START;
 644  PERL_UNUSED_ARG(flags);
 645  PERL_UNUSED_ARG(data);
 646
 647  /* CHR_DIST() would be more correct here but it makes things slow. */
 648  if (prog->minlen > strend - strpos) {
 649   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 650        "String too short... [re_intuit_start]\n"));
 651   goto fail;
 652  }
 653
 654  reginfo->is_utf8_target = cBOOL(utf8_target);
 655  reginfo->info_aux = NULL;
 656  reginfo->strbeg = strbeg;
 657  reginfo->strend = strend;
 658  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 659  reginfo->intuit = 1;
 660  /* not actually used within intuit, but zero for safety anyway */
 661  reginfo->poscache_maxiter = 0;
 662
 663  if (utf8_target) {
 664   if (!prog->check_utf8 && prog->check_substr)
 665    to_utf8_substr(prog);
 666   check = prog->check_utf8;
 667  } else {
 668   if (!prog->check_substr && prog->check_utf8) {
 669    if (! to_byte_substr(prog)) {
 670     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 671    }
 672   }
 673   check = prog->check_substr;
 674  }
 675  if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
 676   ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 677      || ( (prog->extflags & RXf_ANCH_BOL)
 678       && !multiline ) ); /* Check after \n? */
 679
 680   if (!ml_anch) {
 681   if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 682     && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 683    && (strpos != strbeg)) {
 684    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 685    goto fail;
 686   }
 687   if (prog->check_offset_min == prog->check_offset_max
 688    && !(prog->extflags & RXf_CANY_SEEN)
 689    && ! multiline)   /* /m can cause \n's to match that aren't
 690         accounted for in the string max length.
 691         See [perl #115242] */
 692   {
 693    /* Substring at constant offset from beg-of-str... */
 694    I32 slen;
 695
 696    s = HOP3c(strpos, prog->check_offset_min, strend);
 697
 698    if (SvTAIL(check)) {
 699     slen = SvCUR(check); /* >= 1 */
 700
 701     if ( strend - s > slen || strend - s < slen - 1
 702      || (strend - s == slen && strend[-1] != '\n')) {
 703      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 704      goto fail_finish;
 705     }
 706     /* Now should match s[0..slen-2] */
 707     slen--;
 708     if (slen && (*SvPVX_const(check) != *s
 709        || (slen > 1
 710         && memNE(SvPVX_const(check), s, slen)))) {
 711     report_neq:
 712      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 713      goto fail_finish;
 714     }
 715    }
 716    else if (*SvPVX_const(check) != *s
 717      || ((slen = SvCUR(check)) > 1
 718       && memNE(SvPVX_const(check), s, slen)))
 719     goto report_neq;
 720    check_at = s;
 721    goto success_at_start;
 722   }
 723   }
 724   /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 725   s = strpos;
 726   start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 727   end_shift = prog->check_end_shift;
 728
 729   if (!ml_anch) {
 730    const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 731           - (SvTAIL(check) != 0);
 732    const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 733
 734    if (end_shift < eshift)
 735     end_shift = eshift;
 736   }
 737  }
 738  else {    /* Can match at random position */
 739   ml_anch = 0;
 740   s = strpos;
 741   start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 742   end_shift = prog->check_end_shift;
 743
 744   /* end shift should be non negative here */
 745  }
 746
 747 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
 748  if (end_shift < 0)
 749   Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 750     (IV)end_shift, RX_PRECOMP(prog));
 751 #endif
 752
 753   restart:
 754  /* Find a possible match in the region s..strend by looking for
 755  the "check" substring in the region corrected by start/end_shift. */
 756
 757  {
 758   I32 srch_start_shift = start_shift;
 759   I32 srch_end_shift = end_shift;
 760   U8* start_point;
 761   U8* end_point;
 762   if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 763    srch_end_shift -= ((strbeg - s) - srch_start_shift);
 764    srch_start_shift = strbeg - s;
 765   }
 766  DEBUG_OPTIMISE_MORE_r({
 767   PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 768    (IV)prog->check_offset_min,
 769    (IV)srch_start_shift,
 770    (IV)srch_end_shift,
 771    (IV)prog->check_end_shift);
 772  });
 773
 774   if (prog->extflags & RXf_CANY_SEEN) {
 775    start_point= (U8*)(s + srch_start_shift);
 776    end_point= (U8*)(strend - srch_end_shift);
 777   } else {
 778    start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 779    end_point= HOP3(strend, -srch_end_shift, strbeg);
 780   }
 781   DEBUG_OPTIMISE_MORE_r({
 782    PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 783     (int)(end_point - start_point),
 784     (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 785     start_point);
 786   });
 787
 788   s = fbm_instr( start_point, end_point,
 789      check, multiline ? FBMrf_MULTILINE : 0);
 790  }
 791  /* Update the count-of-usability, remove useless subpatterns,
 792   unshift s.  */
 793
 794  DEBUG_EXECUTE_r({
 795   RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 796    SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 797   PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 798       (s ? "Found" : "Did not find"),
 799    (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 800     ? "anchored" : "floating"),
 801    quoted,
 802    RE_SV_TAIL(check),
 803    (s ? " at offset " : "...\n") );
 804  });
 805
 806  if (!s)
 807   goto fail_finish;
 808  /* Finish the diagnostic message */
 809  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 810
 811  /* XXX dmq: first branch is for positive lookbehind...
 812  Our check string is offset from the beginning of the pattern.
 813  So we need to do any stclass tests offset forward from that
 814  point. I think. :-(
 815  */
 816
 817
 818
 819  check_at=s;
 820
 821
 822  /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 823  Start with the other substr.
 824  XXXX no SCREAM optimization yet - and a very coarse implementation
 825  XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 826     *always* match.  Probably should be marked during compile...
 827  Probably it is right to do no SCREAM here...
 828  */
 829
 830  if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 831     : (prog->float_substr && prog->anchored_substr))
 832  {
 833   /* Take into account the "other" substring. */
 834   /* XXXX May be hopelessly wrong for UTF... */
 835   if (!other_last)
 836    other_last = strpos;
 837   if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 838   do_other_anchored:
 839    {
 840     char * const last = HOP3c(s, -start_shift, strbeg);
 841     char *last1, *last2;
 842     char * const saved_s = s;
 843     SV* must;
 844
 845     t = s - prog->check_offset_max;
 846     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 847      && (!utf8_target
 848       || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 849        && t > strpos)))
 850      NOOP;
 851     else
 852      t = strpos;
 853     t = HOP3c(t, prog->anchored_offset, strend);
 854     if (t < other_last) /* These positions already checked */
 855      t = other_last;
 856     last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 857     if (last < last1)
 858      last1 = last;
 859     /* XXXX It is not documented what units *_offsets are in.
 860     We assume bytes, but this is clearly wrong.
 861     Meaning this code needs to be carefully reviewed for errors.
 862     dmq.
 863     */
 864
 865     /* On end-of-str: see comment below. */
 866     must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 867     if (must == &PL_sv_undef) {
 868      s = (char*)NULL;
 869      DEBUG_r(must = prog->anchored_utf8); /* for debug */
 870     }
 871     else
 872      s = fbm_instr(
 873       (unsigned char*)t,
 874       HOP3(HOP3(last1, prog->anchored_offset, strend)
 875         + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 876       must,
 877       multiline ? FBMrf_MULTILINE : 0
 878      );
 879     DEBUG_EXECUTE_r({
 880      RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 881       SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 882      PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 883       (s ? "Found" : "Contradicts"),
 884       quoted, RE_SV_TAIL(must));
 885     });
 886
 887
 888     if (!s) {
 889      if (last1 >= last2) {
 890       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 891             ", giving up...\n"));
 892       goto fail_finish;
 893      }
 894      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 895       ", trying floating at offset %ld...\n",
 896       (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 897      other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 898      s = HOP3c(last, 1, strend);
 899      goto restart;
 900     }
 901     else {
 902      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 903       (long)(s - i_strpos)));
 904      t = HOP3c(s, -prog->anchored_offset, strbeg);
 905      other_last = HOP3c(s, 1, strend);
 906      s = saved_s;
 907      if (t == strpos)
 908       goto try_at_start;
 909      goto try_at_offset;
 910     }
 911    }
 912   }
 913   else {  /* Take into account the floating substring. */
 914    char *last, *last1;
 915    char * const saved_s = s;
 916    SV* must;
 917
 918    t = HOP3c(s, -start_shift, strbeg);
 919    last1 = last =
 920     HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 921    if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 922     last = HOP3c(t, prog->float_max_offset, strend);
 923    s = HOP3c(t, prog->float_min_offset, strend);
 924    if (s < other_last)
 925     s = other_last;
 926  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 927    must = utf8_target ? prog->float_utf8 : prog->float_substr;
 928    /* fbm_instr() takes into account exact value of end-of-str
 929    if the check is SvTAIL(ed).  Since false positives are OK,
 930    and end-of-str is not later than strend we are OK. */
 931    if (must == &PL_sv_undef) {
 932     s = (char*)NULL;
 933     DEBUG_r(must = prog->float_utf8); /* for debug message */
 934    }
 935    else
 936     s = fbm_instr((unsigned char*)s,
 937        (unsigned char*)last + SvCUR(must)
 938         - (SvTAIL(must)!=0),
 939        must, multiline ? FBMrf_MULTILINE : 0);
 940    DEBUG_EXECUTE_r({
 941     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 942      SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 943     PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 944      (s ? "Found" : "Contradicts"),
 945      quoted, RE_SV_TAIL(must));
 946    });
 947    if (!s) {
 948     if (last1 == last) {
 949      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 950            ", giving up...\n"));
 951      goto fail_finish;
 952     }
 953     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 954      ", trying anchored starting at offset %ld...\n",
 955      (long)(saved_s + 1 - i_strpos)));
 956     other_last = last;
 957     s = HOP3c(t, 1, strend);
 958     goto restart;
 959    }
 960    else {
 961     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 962      (long)(s - i_strpos)));
 963     other_last = s; /* Fix this later. --Hugo */
 964     s = saved_s;
 965     if (t == strpos)
 966      goto try_at_start;
 967     goto try_at_offset;
 968    }
 969   }
 970  }
 971
 972
 973  t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 974
 975  DEBUG_OPTIMISE_MORE_r(
 976   PerlIO_printf(Perl_debug_log,
 977    "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 978    (IV)prog->check_offset_min,
 979    (IV)prog->check_offset_max,
 980    (IV)(s-strpos),
 981    (IV)(t-strpos),
 982    (IV)(t-s),
 983    (IV)(strend-strpos)
 984   )
 985  );
 986
 987  if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 988   && (!utf8_target
 989    || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 990     && t > strpos)))
 991  {
 992   /* Fixed substring is found far enough so that the match
 993   cannot start at strpos. */
 994  try_at_offset:
 995   if (ml_anch && t[-1] != '\n') {
 996    /* Eventually fbm_*() should handle this, but often
 997    anchored_offset is not 0, so this check will not be wasted. */
 998    /* XXXX In the code below we prefer to look for "^" even in
 999    presence of anchored substrings.  And we search even
1000    beyond the found float position.  These pessimizations
1001    are historical artefacts only.  */
1002   find_anchor:
1003    while (t < strend - prog->minlen) {
1004     if (*t == '\n') {
1005      if (t < check_at - prog->check_offset_min) {
1006       if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1007        /* Since we moved from the found position,
1008        we definitely contradict the found anchored
1009        substr.  Due to the above check we do not
1010        contradict "check" substr.
1011        Thus we can arrive here only if check substr
1012        is float.  Redo checking for "other"=="fixed".
1013        */
1014        strpos = t + 1;
1015        DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1016         PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1017        goto do_other_anchored;
1018       }
1019       /* We don't contradict the found floating substring. */
1020       /* XXXX Why not check for STCLASS? */
1021       s = t + 1;
1022       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1023        PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1024       goto set_useful;
1025      }
1026      /* Position contradicts check-string */
1027      /* XXXX probably better to look for check-string
1028      than for "\n", so one should lower the limit for t? */
1029      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1030       PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1031      other_last = strpos = s = t + 1;
1032      goto restart;
1033     }
1034     t++;
1035    }
1036    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1037       PL_colors[0], PL_colors[1]));
1038    goto fail_finish;
1039   }
1040   else {
1041    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1042       PL_colors[0], PL_colors[1]));
1043   }
1044   s = t;
1045  set_useful:
1046   ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1047  }
1048  else {
1049   /* The found string does not prohibit matching at strpos,
1050   - no optimization of calling REx engine can be performed,
1051   unless it was an MBOL and we are not after MBOL,
1052   or a future STCLASS check will fail this. */
1053  try_at_start:
1054   /* Even in this situation we may use MBOL flag if strpos is offset
1055   wrt the start of the string. */
1056   if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1057    /* May be due to an implicit anchor of m{.*foo}  */
1058    && !(prog->intflags & PREGf_IMPLICIT))
1059   {
1060    t = strpos;
1061    goto find_anchor;
1062   }
1063   DEBUG_EXECUTE_r( if (ml_anch)
1064    PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1065       (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1066   );
1067  success_at_start:
1068   if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1069    && (utf8_target ? (
1070     prog->check_utf8  /* Could be deleted already */
1071     && --BmUSEFUL(prog->check_utf8) < 0
1072     && (prog->check_utf8 == prog->float_utf8)
1073    ) : (
1074     prog->check_substr  /* Could be deleted already */
1075     && --BmUSEFUL(prog->check_substr) < 0
1076     && (prog->check_substr == prog->float_substr)
1077    )))
1078   {
1079    /* If flags & SOMETHING - do not do it many times on the same match */
1080    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1081    /* XXX Does the destruction order has to change with utf8_target? */
1082    SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1083    SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1084    prog->check_substr = prog->check_utf8 = NULL; /* disable */
1085    prog->float_substr = prog->float_utf8 = NULL; /* clear */
1086    check = NULL;   /* abort */
1087    s = strpos;
1088    /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1089      see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1090    if (prog->intflags & PREGf_IMPLICIT)
1091     prog->extflags &= ~RXf_ANCH_MBOL;
1092    /* XXXX This is a remnant of the old implementation.  It
1093      looks wasteful, since now INTUIT can use many
1094      other heuristics. */
1095    prog->extflags &= ~RXf_USE_INTUIT;
1096    /* XXXX What other flags might need to be cleared in this branch? */
1097   }
1098   else
1099    s = strpos;
1100  }
1101
1102  /* Last resort... */
1103  /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1104  /* trie stclasses are too expensive to use here, we are better off to
1105  leave it to regmatch itself */
1106  if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1107   /* minlen == 0 is possible if regstclass is \b or \B,
1108   and the fixed substr is ''$.
1109   Since minlen is already taken into account, s+1 is before strend;
1110   accidentally, minlen >= 1 guaranties no false positives at s + 1
1111   even for \b or \B.  But (minlen? 1 : 0) below assumes that
1112   regstclass does not come from lookahead...  */
1113   /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1114   This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1115   const U8* const str = (U8*)STRING(progi->regstclass);
1116   const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1117      ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1118      : 1);
1119   char * endpos;
1120   if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1121    endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1122   else if (prog->float_substr || prog->float_utf8)
1123    endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1124   else
1125    endpos= strend;
1126
1127   if (checked_upto < s)
1128   checked_upto = s;
1129   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1130          (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1131
1132   t = s;
1133   s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1134        reginfo);
1135   if (s) {
1136    checked_upto = s;
1137   } else {
1138 #ifdef DEBUGGING
1139    const char *what = NULL;
1140 #endif
1141    if (endpos == strend) {
1142     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1143         "Could not match STCLASS...\n") );
1144     goto fail;
1145    }
1146    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1147         "This position contradicts STCLASS...\n") );
1148    if ((prog->extflags & RXf_ANCH) && !ml_anch)
1149     goto fail;
1150    checked_upto = HOPBACKc(endpos, start_shift);
1151    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1152          (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1153    /* Contradict one of substrings */
1154    if (prog->anchored_substr || prog->anchored_utf8) {
1155     if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1156      DEBUG_EXECUTE_r( what = "anchored" );
1157     hop_and_restart:
1158      s = HOP3c(t, 1, strend);
1159      if (s + start_shift + end_shift > strend) {
1160       /* XXXX Should be taken into account earlier? */
1161       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1162            "Could not match STCLASS...\n") );
1163       goto fail;
1164      }
1165      if (!check)
1166       goto giveup;
1167      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1168         "Looking for %s substr starting at offset %ld...\n",
1169         what, (long)(s + start_shift - i_strpos)) );
1170      goto restart;
1171     }
1172     /* Have both, check_string is floating */
1173     if (t + start_shift >= check_at) /* Contradicts floating=check */
1174      goto retry_floating_check;
1175     /* Recheck anchored substring, but not floating... */
1176     s = check_at;
1177     if (!check)
1178      goto giveup;
1179     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1180       "Looking for anchored substr starting at offset %ld...\n",
1181       (long)(other_last - i_strpos)) );
1182     goto do_other_anchored;
1183    }
1184    /* Another way we could have checked stclass at the
1185    current position only: */
1186    if (ml_anch) {
1187     s = t = t + 1;
1188     if (!check)
1189      goto giveup;
1190     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1191       "Looking for /%s^%s/m starting at offset %ld...\n",
1192       PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1193     goto try_at_offset;
1194    }
1195    if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1196     goto fail;
1197    /* Check is floating substring. */
1198   retry_floating_check:
1199    t = check_at - start_shift;
1200    DEBUG_EXECUTE_r( what = "floating" );
1201    goto hop_and_restart;
1202   }
1203   if (t != s) {
1204    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1205       "By STCLASS: moving %ld --> %ld\n",
1206         (long)(t - i_strpos), (long)(s - i_strpos))
1207     );
1208   }
1209   else {
1210    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1211         "Does not contradict STCLASS...\n");
1212     );
1213   }
1214  }
1215   giveup:
1216  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1217       PL_colors[4], (check ? "Guessed" : "Giving up"),
1218       PL_colors[5], (long)(s - i_strpos)) );
1219  return s;
1220
1221   fail_finish:    /* Substring not found */
1222  if (prog->check_substr || prog->check_utf8)  /* could be removed already */
1223   BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1224   fail:
1225  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1226       PL_colors[4], PL_colors[5]));
1227  return NULL;
1228 }
1229
1230 #define DECL_TRIE_TYPE(scan) \
1231  const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1232      trie_type = ((scan->flags == EXACT) \
1233        ? (utf8_target ? trie_utf8 : trie_plain) \
1234        : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1235
1236 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1237 STMT_START {                               \
1238  STRLEN skiplen;                                                                 \
1239  switch (trie_type) {                                                            \
1240  case trie_utf8_fold:                                                            \
1241   if ( foldlen>0 ) {                                                          \
1242    uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1243    foldlen -= len;                                                         \
1244    uscan += len;                                                           \
1245    len=0;                                                                  \
1246   } else {                                                                    \
1247    uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1248    len = UTF8SKIP(uc);                                                     \
1249    skiplen = UNISKIP( uvc );                                               \
1250    foldlen -= skiplen;                                                     \
1251    uscan = foldbuf + skiplen;                                              \
1252   }                                                                           \
1253   break;                                                                      \
1254  case trie_latin_utf8_fold:                                                      \
1255   if ( foldlen>0 ) {                                                          \
1256    uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1257    foldlen -= len;                                                         \
1258    uscan += len;                                                           \
1259    len=0;                                                                  \
1260   } else {                                                                    \
1261    len = 1;                                                                \
1262    uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL);   \
1263    skiplen = UNISKIP( uvc );                                               \
1264    foldlen -= skiplen;                                                     \
1265    uscan = foldbuf + skiplen;                                              \
1266   }                                                                           \
1267   break;                                                                      \
1268  case trie_utf8:                                                                 \
1269   uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1270   break;                                                                      \
1271  case trie_plain:                                                                \
1272   uvc = (UV)*uc;                                                              \
1273   len = 1;                                                                    \
1274  }                                                                               \
1275  if (uvc < 256) {                                                                \
1276   charid = trie->charmap[ uvc ];                                              \
1277  }                                                                               \
1278  else {                                                                          \
1279   charid = 0;                                                                 \
1280   if (widecharmap) {                                                          \
1281    SV** const svpp = hv_fetch(widecharmap,                                 \
1282       (char*)&uvc, sizeof(UV), 0);                                \
1283    if (svpp)                                                               \
1284     charid = (U16)SvIV(*svpp);                                          \
1285   }                                                                           \
1286  }                                                                               \
1287 } STMT_END
1288
1289 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1290 STMT_START {                                              \
1291  while (s <= e) {                                      \
1292   if ( (CoNd)                                       \
1293    && (ln == 1 || folder(s, pat_string, ln))    \
1294    && (reginfo->intuit || regtry(reginfo, &s)) )\
1295    goto got_it;                                  \
1296   s++;                                              \
1297  }                                                     \
1298 } STMT_END
1299
1300 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1301 STMT_START {                                          \
1302  while (s < strend) {                              \
1303   CoDe                                          \
1304   s += UTF8SKIP(s);                             \
1305  }                                                 \
1306 } STMT_END
1307
1308 #define REXEC_FBC_SCAN(CoDe)                          \
1309 STMT_START {                                          \
1310  while (s < strend) {                              \
1311   CoDe                                          \
1312   s++;                                          \
1313  }                                                 \
1314 } STMT_END
1315
1316 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1317 REXEC_FBC_UTF8_SCAN(                                  \
1318  if (CoNd) {                                       \
1319   if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1320    goto got_it;                              \
1321   else                                          \
1322    tmp = doevery;                            \
1323  }                                                 \
1324  else                                              \
1325   tmp = 1;                                      \
1326 )
1327
1328 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1329 REXEC_FBC_SCAN(                                       \
1330  if (CoNd) {                                       \
1331   if (tmp && (reginfo->intuit || regtry(reginfo, &s)))  \
1332    goto got_it;                              \
1333   else                                          \
1334    tmp = doevery;                            \
1335  }                                                 \
1336  else                                              \
1337   tmp = 1;                                      \
1338 )
1339
1340 #define REXEC_FBC_TRYIT               \
1341 if ((reginfo->intuit || regtry(reginfo, &s))) \
1342  goto got_it
1343
1344 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1345  if (utf8_target) {                                             \
1346   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1347  }                                                          \
1348  else {                                                     \
1349   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1350  }
1351
1352 #define DUMP_EXEC_POS(li,s,doutf8) \
1353  dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1354     startpos, doutf8)
1355
1356
1357 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1358   tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1359   tmp = TEST_NON_UTF8(tmp);                                              \
1360   REXEC_FBC_UTF8_SCAN(                                                   \
1361    if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1362     tmp = !tmp;                                                    \
1363     IF_SUCCESS;                                                    \
1364    }                                                                  \
1365    else {                                                             \
1366     IF_FAIL;                                                       \
1367    }                                                                  \
1368   );                                                                     \
1369
1370 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1371   if (s == reginfo->strbeg) {                                            \
1372    tmp = '\n';                                                        \
1373   }                                                                      \
1374   else {                                                                 \
1375    U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);          \
1376    tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1377   }                                                                      \
1378   tmp = TeSt1_UtF8;                                                      \
1379   LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1380   REXEC_FBC_UTF8_SCAN(                                                   \
1381    if (tmp == ! (TeSt2_UtF8)) { \
1382     tmp = !tmp;                                                    \
1383     IF_SUCCESS;                                                    \
1384    }                                                                  \
1385    else {                                                             \
1386     IF_FAIL;                                                       \
1387    }                                                                  \
1388   );                                                                     \
1389
1390 /* The only difference between the BOUND and NBOUND cases is that
1391  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1392  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1393  * with the other one being empty */
1394 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1395  FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1396
1397 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1398  FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1399
1400 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1401  FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1402
1403 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1404  FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1405
1406
1407 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1408  * be passed in completely with the variable name being tested, which isn't
1409  * such a clean interface, but this is easier to read than it was before.  We
1410  * are looking for the boundary (or non-boundary between a word and non-word
1411  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1412  * must be different.  Find the "wordness" of the character just prior to this
1413  * one, and compare it with the wordness of this one.  If they differ, we have
1414  * a boundary.  At the beginning of the string, pretend that the previous
1415  * character was a new-line */
1416 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1417  if (utf8_target) {                                                         \
1418     UTF8_CODE \
1419  }                                                                          \
1420  else {  /* Not utf8 */                                                     \
1421   tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1422   tmp = TEST_NON_UTF8(tmp);                                              \
1423   REXEC_FBC_SCAN(                                                        \
1424    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1425     tmp = !tmp;                                                    \
1426     IF_SUCCESS;                                                    \
1427    }                                                                  \
1428    else {                                                             \
1429     IF_FAIL;                                                       \
1430    }                                                                  \
1431   );                                                                     \
1432  }                                                                          \
1433  if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s)))           \
1434   goto got_it;
1435
1436 /* We know what class REx starts with.  Try to find this position... */
1437 /* if reginfo->intuit, its a dryrun */
1438 /* annoyingly all the vars in this routine have different names from their counterparts
1439    in regmatch. /grrr */
1440
1441 STATIC char *
1442 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1443  const char *strend, regmatch_info *reginfo)
1444 {
1445  dVAR;
1446  const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1447  char *pat_string;   /* The pattern's exactish string */
1448  char *pat_end;     /* ptr to end char of pat_string */
1449  re_fold_t folder; /* Function for computing non-utf8 folds */
1450  const U8 *fold_array;   /* array for folding ords < 256 */
1451  STRLEN ln;
1452  STRLEN lnc;
1453  U8 c1;
1454  U8 c2;
1455  char *e;
1456  I32 tmp = 1; /* Scratch variable? */
1457  const bool utf8_target = reginfo->is_utf8_target;
1458  UV utf8_fold_flags = 0;
1459  const bool is_utf8_pat = reginfo->is_utf8_pat;
1460  bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1461         with a result inverts that result, as 0^1 =
1462         1 and 1^1 = 0 */
1463  _char_class_number classnum;
1464
1465  RXi_GET_DECL(prog,progi);
1466
1467  PERL_ARGS_ASSERT_FIND_BYCLASS;
1468
1469  /* We know what class it must start with. */
1470  switch (OP(c)) {
1471  case ANYOF:
1472  case ANYOF_SYNTHETIC:
1473  case ANYOF_WARN_SUPER:
1474   if (utf8_target) {
1475    REXEC_FBC_UTF8_CLASS_SCAN(
1476      reginclass(prog, c, (U8*)s, utf8_target));
1477   }
1478   else {
1479    REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1480   }
1481   break;
1482  case CANY:
1483   REXEC_FBC_SCAN(
1484    if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1485     goto got_it;
1486    else
1487     tmp = doevery;
1488   );
1489   break;
1490
1491  case EXACTFA:
1492   if (is_utf8_pat || utf8_target) {
1493    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1494    goto do_exactf_utf8;
1495   }
1496   fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1497   folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1498   goto do_exactf_non_utf8; /* isn't dealt with by these */
1499
1500  case EXACTF:
1501   if (utf8_target) {
1502
1503    /* regcomp.c already folded this if pattern is in UTF-8 */
1504    utf8_fold_flags = 0;
1505    goto do_exactf_utf8;
1506   }
1507   fold_array = PL_fold;
1508   folder = foldEQ;
1509   goto do_exactf_non_utf8;
1510
1511  case EXACTFL:
1512   if (is_utf8_pat || utf8_target) {
1513    utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1514    goto do_exactf_utf8;
1515   }
1516   fold_array = PL_fold_locale;
1517   folder = foldEQ_locale;
1518   goto do_exactf_non_utf8;
1519
1520  case EXACTFU_SS:
1521   if (is_utf8_pat) {
1522    utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1523   }
1524   goto do_exactf_utf8;
1525
1526  case EXACTFU_TRICKYFOLD:
1527  case EXACTFU:
1528   if (is_utf8_pat || utf8_target) {
1529    utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1530    goto do_exactf_utf8;
1531   }
1532
1533   /* Any 'ss' in the pattern should have been replaced by regcomp,
1534   * so we don't have to worry here about this single special case
1535   * in the Latin1 range */
1536   fold_array = PL_fold_latin1;
1537   folder = foldEQ_latin1;
1538
1539   /* FALL THROUGH */
1540
1541  do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1542       are no glitches with fold-length differences
1543       between the target string and pattern */
1544
1545   /* The idea in the non-utf8 EXACTF* cases is to first find the
1546   * first character of the EXACTF* node and then, if necessary,
1547   * case-insensitively compare the full text of the node.  c1 is the
1548   * first character.  c2 is its fold.  This logic will not work for
1549   * Unicode semantics and the german sharp ss, which hence should
1550   * not be compiled into a node that gets here. */
1551   pat_string = STRING(c);
1552   ln  = STR_LEN(c); /* length to match in octets/bytes */
1553
1554   /* We know that we have to match at least 'ln' bytes (which is the
1555   * same as characters, since not utf8).  If we have to match 3
1556   * characters, and there are only 2 availabe, we know without
1557   * trying that it will fail; so don't start a match past the
1558   * required minimum number from the far end */
1559   e = HOP3c(strend, -((I32)ln), s);
1560
1561   if (reginfo->intuit && e < s) {
1562    e = s;   /* Due to minlen logic of intuit() */
1563   }
1564
1565   c1 = *pat_string;
1566   c2 = fold_array[c1];
1567   if (c1 == c2) { /* If char and fold are the same */
1568    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1569   }
1570   else {
1571    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1572   }
1573   break;
1574
1575  do_exactf_utf8:
1576  {
1577   unsigned expansion;
1578
1579   /* If one of the operands is in utf8, we can't use the simpler folding
1580   * above, due to the fact that many different characters can have the
1581   * same fold, or portion of a fold, or different- length fold */
1582   pat_string = STRING(c);
1583   ln  = STR_LEN(c); /* length to match in octets/bytes */
1584   pat_end = pat_string + ln;
1585   lnc = is_utf8_pat       /* length to match in characters */
1586     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1587     : ln;
1588
1589   /* We have 'lnc' characters to match in the pattern, but because of
1590   * multi-character folding, each character in the target can match
1591   * up to 3 characters (Unicode guarantees it will never exceed
1592   * this) if it is utf8-encoded; and up to 2 if not (based on the
1593   * fact that the Latin 1 folds are already determined, and the
1594   * only multi-char fold in that range is the sharp-s folding to
1595   * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1596   * string character.  Adjust lnc accordingly, rounding up, so that
1597   * if we need to match at least 4+1/3 chars, that really is 5. */
1598   expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1599   lnc = (lnc + expansion - 1) / expansion;
1600
1601   /* As in the non-UTF8 case, if we have to match 3 characters, and
1602   * only 2 are left, it's guaranteed to fail, so don't start a
1603   * match that would require us to go beyond the end of the string
1604   */
1605   e = HOP3c(strend, -((I32)lnc), s);
1606
1607   if (reginfo->intuit && e < s) {
1608    e = s;   /* Due to minlen logic of intuit() */
1609   }
1610
1611   /* XXX Note that we could recalculate e to stop the loop earlier,
1612   * as the worst case expansion above will rarely be met, and as we
1613   * go along we would usually find that e moves further to the left.
1614   * This would happen only after we reached the point in the loop
1615   * where if there were no expansion we should fail.  Unclear if
1616   * worth the expense */
1617
1618   while (s <= e) {
1619    char *my_strend= (char *)strend;
1620    if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1621     pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1622     && (reginfo->intuit || regtry(reginfo, &s)) )
1623    {
1624     goto got_it;
1625    }
1626    s += (utf8_target) ? UTF8SKIP(s) : 1;
1627   }
1628   break;
1629  }
1630  case BOUNDL:
1631   RXp_MATCH_TAINTED_on(prog);
1632   FBC_BOUND(isWORDCHAR_LC,
1633     isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1634     isWORDCHAR_LC_utf8((U8*)s));
1635   break;
1636  case NBOUNDL:
1637   RXp_MATCH_TAINTED_on(prog);
1638   FBC_NBOUND(isWORDCHAR_LC,
1639     isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1640     isWORDCHAR_LC_utf8((U8*)s));
1641   break;
1642  case BOUND:
1643   FBC_BOUND(isWORDCHAR,
1644     isWORDCHAR_uni(tmp),
1645     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1646   break;
1647  case BOUNDA:
1648   FBC_BOUND_NOLOAD(isWORDCHAR_A,
1649       isWORDCHAR_A(tmp),
1650       isWORDCHAR_A((U8*)s));
1651   break;
1652  case NBOUND:
1653   FBC_NBOUND(isWORDCHAR,
1654     isWORDCHAR_uni(tmp),
1655     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1656   break;
1657  case NBOUNDA:
1658   FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1659       isWORDCHAR_A(tmp),
1660       isWORDCHAR_A((U8*)s));
1661   break;
1662  case BOUNDU:
1663   FBC_BOUND(isWORDCHAR_L1,
1664     isWORDCHAR_uni(tmp),
1665     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1666   break;
1667  case NBOUNDU:
1668   FBC_NBOUND(isWORDCHAR_L1,
1669     isWORDCHAR_uni(tmp),
1670     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1671   break;
1672  case LNBREAK:
1673   REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1674       is_LNBREAK_latin1_safe(s, strend)
1675   );
1676   break;
1677
1678  /* The argument to all the POSIX node types is the class number to pass to
1679  * _generic_isCC() to build a mask for searching in PL_charclass[] */
1680
1681  case NPOSIXL:
1682   to_complement = 1;
1683   /* FALLTHROUGH */
1684
1685  case POSIXL:
1686   RXp_MATCH_TAINTED_on(prog);
1687   REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1688       to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1689   break;
1690
1691  case NPOSIXD:
1692   to_complement = 1;
1693   /* FALLTHROUGH */
1694
1695  case POSIXD:
1696   if (utf8_target) {
1697    goto posix_utf8;
1698   }
1699   goto posixa;
1700
1701  case NPOSIXA:
1702   if (utf8_target) {
1703    /* The complement of something that matches only ASCII matches all
1704    * UTF-8 variant code points, plus everything in ASCII that isn't
1705    * in the class */
1706    REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1707          || ! _generic_isCC_A(*s, FLAGS(c)));
1708    break;
1709   }
1710
1711   to_complement = 1;
1712   /* FALLTHROUGH */
1713
1714  case POSIXA:
1715  posixa:
1716   /* Don't need to worry about utf8, as it can match only a single
1717   * byte invariant character. */
1718   REXEC_FBC_CLASS_SCAN(
1719       to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1720   break;
1721
1722  case NPOSIXU:
1723   to_complement = 1;
1724   /* FALLTHROUGH */
1725
1726  case POSIXU:
1727   if (! utf8_target) {
1728    REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1729                  FLAGS(c))));
1730   }
1731   else {
1732
1733  posix_utf8:
1734    classnum = (_char_class_number) FLAGS(c);
1735    if (classnum < _FIRST_NON_SWASH_CC) {
1736     while (s < strend) {
1737
1738      /* We avoid loading in the swash as long as possible, but
1739      * should we have to, we jump to a separate loop.  This
1740      * extra 'if' statement is what keeps this code from being
1741      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1742      if (UTF8_IS_ABOVE_LATIN1(*s)) {
1743       goto found_above_latin1;
1744      }
1745      if ((UTF8_IS_INVARIANT(*s)
1746       && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1747                 classnum)))
1748       || (UTF8_IS_DOWNGRADEABLE_START(*s)
1749        && to_complement ^ cBOOL(
1750         _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1751            classnum))))
1752      {
1753       if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1754        goto got_it;
1755       else {
1756        tmp = doevery;
1757       }
1758      }
1759      else {
1760       tmp = 1;
1761      }
1762      s += UTF8SKIP(s);
1763     }
1764    }
1765    else switch (classnum) {    /* These classes are implemented as
1766           macros */
1767     case _CC_ENUM_SPACE: /* XXX would require separate code if we
1768           revert the change of \v matching this */
1769      /* FALL THROUGH */
1770
1771     case _CC_ENUM_PSXSPC:
1772      REXEC_FBC_UTF8_CLASS_SCAN(
1773           to_complement ^ cBOOL(isSPACE_utf8(s)));
1774      break;
1775
1776     case _CC_ENUM_BLANK:
1777      REXEC_FBC_UTF8_CLASS_SCAN(
1778           to_complement ^ cBOOL(isBLANK_utf8(s)));
1779      break;
1780
1781     case _CC_ENUM_XDIGIT:
1782      REXEC_FBC_UTF8_CLASS_SCAN(
1783          to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1784      break;
1785
1786     case _CC_ENUM_VERTSPACE:
1787      REXEC_FBC_UTF8_CLASS_SCAN(
1788          to_complement ^ cBOOL(isVERTWS_utf8(s)));
1789      break;
1790
1791     case _CC_ENUM_CNTRL:
1792      REXEC_FBC_UTF8_CLASS_SCAN(
1793           to_complement ^ cBOOL(isCNTRL_utf8(s)));
1794      break;
1795
1796     default:
1797      Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1798      assert(0); /* NOTREACHED */
1799    }
1800   }
1801   break;
1802
1803  found_above_latin1:   /* Here we have to load a swash to get the result
1804        for the current code point */
1805   if (! PL_utf8_swash_ptrs[classnum]) {
1806    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1807    PL_utf8_swash_ptrs[classnum] =
1808      _core_swash_init("utf8", swash_property_names[classnum],
1809          &PL_sv_undef, 1, 0, NULL, &flags);
1810   }
1811
1812   /* This is a copy of the loop above for swash classes, though using the
1813   * FBC macro instead of being expanded out.  Since we've loaded the
1814   * swash, we don't have to check for that each time through the loop */
1815   REXEC_FBC_UTF8_CLASS_SCAN(
1816     to_complement ^ cBOOL(_generic_utf8(
1817          classnum,
1818          s,
1819          swash_fetch(PL_utf8_swash_ptrs[classnum],
1820             (U8 *) s, TRUE))));
1821   break;
1822
1823  case AHOCORASICKC:
1824  case AHOCORASICK:
1825   {
1826    DECL_TRIE_TYPE(c);
1827    /* what trie are we using right now */
1828    reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1829    reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1830    HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1831
1832    const char *last_start = strend - trie->minlen;
1833 #ifdef DEBUGGING
1834    const char *real_start = s;
1835 #endif
1836    STRLEN maxlen = trie->maxlen;
1837    SV *sv_points;
1838    U8 **points; /* map of where we were in the input string
1839        when reading a given char. For ASCII this
1840        is unnecessary overhead as the relationship
1841        is always 1:1, but for Unicode, especially
1842        case folded Unicode this is not true. */
1843    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1844    U8 *bitmap=NULL;
1845
1846
1847    GET_RE_DEBUG_FLAGS_DECL;
1848
1849    /* We can't just allocate points here. We need to wrap it in
1850    * an SV so it gets freed properly if there is a croak while
1851    * running the match */
1852    ENTER;
1853    SAVETMPS;
1854    sv_points=newSV(maxlen * sizeof(U8 *));
1855    SvCUR_set(sv_points,
1856     maxlen * sizeof(U8 *));
1857    SvPOK_on(sv_points);
1858    sv_2mortal(sv_points);
1859    points=(U8**)SvPV_nolen(sv_points );
1860    if ( trie_type != trie_utf8_fold
1861     && (trie->bitmap || OP(c)==AHOCORASICKC) )
1862    {
1863     if (trie->bitmap)
1864      bitmap=(U8*)trie->bitmap;
1865     else
1866      bitmap=(U8*)ANYOF_BITMAP(c);
1867    }
1868    /* this is the Aho-Corasick algorithm modified a touch
1869    to include special handling for long "unknown char" sequences.
1870    The basic idea being that we use AC as long as we are dealing
1871    with a possible matching char, when we encounter an unknown char
1872    (and we have not encountered an accepting state) we scan forward
1873    until we find a legal starting char.
1874    AC matching is basically that of trie matching, except that when
1875    we encounter a failing transition, we fall back to the current
1876    states "fail state", and try the current char again, a process
1877    we repeat until we reach the root state, state 1, or a legal
1878    transition. If we fail on the root state then we can either
1879    terminate if we have reached an accepting state previously, or
1880    restart the entire process from the beginning if we have not.
1881
1882    */
1883    while (s <= last_start) {
1884     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1885     U8 *uc = (U8*)s;
1886     U16 charid = 0;
1887     U32 base = 1;
1888     U32 state = 1;
1889     UV uvc = 0;
1890     STRLEN len = 0;
1891     STRLEN foldlen = 0;
1892     U8 *uscan = (U8*)NULL;
1893     U8 *leftmost = NULL;
1894 #ifdef DEBUGGING
1895     U32 accepted_word= 0;
1896 #endif
1897     U32 pointpos = 0;
1898
1899     while ( state && uc <= (U8*)strend ) {
1900      int failed=0;
1901      U32 word = aho->states[ state ].wordnum;
1902
1903      if( state==1 ) {
1904       if ( bitmap ) {
1905        DEBUG_TRIE_EXECUTE_r(
1906         if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1907          dump_exec_pos( (char *)uc, c, strend, real_start,
1908           (char *)uc, utf8_target );
1909          PerlIO_printf( Perl_debug_log,
1910           " Scanning for legal start char...\n");
1911         }
1912        );
1913        if (utf8_target) {
1914         while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1915          uc += UTF8SKIP(uc);
1916         }
1917        } else {
1918         while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1919          uc++;
1920         }
1921        }
1922        s= (char *)uc;
1923       }
1924       if (uc >(U8*)last_start) break;
1925      }
1926
1927      if ( word ) {
1928       U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1929       if (!leftmost || lpos < leftmost) {
1930        DEBUG_r(accepted_word=word);
1931        leftmost= lpos;
1932       }
1933       if (base==0) break;
1934
1935      }
1936      points[pointpos++ % maxlen]= uc;
1937      if (foldlen || uc < (U8*)strend) {
1938       REXEC_TRIE_READ_CHAR(trie_type, trie,
1939           widecharmap, uc,
1940           uscan, len, uvc, charid, foldlen,
1941           foldbuf, uniflags);
1942       DEBUG_TRIE_EXECUTE_r({
1943        dump_exec_pos( (char *)uc, c, strend,
1944           real_start, s, utf8_target);
1945        PerlIO_printf(Perl_debug_log,
1946         " Charid:%3u CP:%4"UVxf" ",
1947         charid, uvc);
1948       });
1949      }
1950      else {
1951       len = 0;
1952       charid = 0;
1953      }
1954
1955
1956      do {
1957 #ifdef DEBUGGING
1958       word = aho->states[ state ].wordnum;
1959 #endif
1960       base = aho->states[ state ].trans.base;
1961
1962       DEBUG_TRIE_EXECUTE_r({
1963        if (failed)
1964         dump_exec_pos( (char *)uc, c, strend, real_start,
1965          s,   utf8_target );
1966        PerlIO_printf( Perl_debug_log,
1967         "%sState: %4"UVxf", word=%"UVxf,
1968         failed ? " Fail transition to " : "",
1969         (UV)state, (UV)word);
1970       });
1971       if ( base ) {
1972        U32 tmp;
1973        I32 offset;
1974        if (charid &&
1975         ( ((offset = base + charid
1976          - 1 - trie->uniquecharcount)) >= 0)
1977         && ((U32)offset < trie->lasttrans)
1978         && trie->trans[offset].check == state
1979         && (tmp=trie->trans[offset].next))
1980        {
1981         DEBUG_TRIE_EXECUTE_r(
1982          PerlIO_printf( Perl_debug_log," - legal\n"));
1983         state = tmp;
1984         break;
1985        }
1986        else {
1987         DEBUG_TRIE_EXECUTE_r(
1988          PerlIO_printf( Perl_debug_log," - fail\n"));
1989         failed = 1;
1990         state = aho->fail[state];
1991        }
1992       }
1993       else {
1994        /* we must be accepting here */
1995        DEBUG_TRIE_EXECUTE_r(
1996          PerlIO_printf( Perl_debug_log," - accepting\n"));
1997        failed = 1;
1998        break;
1999       }
2000      } while(state);
2001      uc += len;
2002      if (failed) {
2003       if (leftmost)
2004        break;
2005       if (!state) state = 1;
2006      }
2007     }
2008     if ( aho->states[ state ].wordnum ) {
2009      U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2010      if (!leftmost || lpos < leftmost) {
2011       DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2012       leftmost = lpos;
2013      }
2014     }
2015     if (leftmost) {
2016      s = (char*)leftmost;
2017      DEBUG_TRIE_EXECUTE_r({
2018       PerlIO_printf(
2019        Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2020        (UV)accepted_word, (IV)(s - real_start)
2021       );
2022      });
2023      if (reginfo->intuit || regtry(reginfo, &s)) {
2024       FREETMPS;
2025       LEAVE;
2026       goto got_it;
2027      }
2028      s = HOPc(s,1);
2029      DEBUG_TRIE_EXECUTE_r({
2030       PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2031      });
2032     } else {
2033      DEBUG_TRIE_EXECUTE_r(
2034       PerlIO_printf( Perl_debug_log,"No match.\n"));
2035      break;
2036     }
2037    }
2038    FREETMPS;
2039    LEAVE;
2040   }
2041   break;
2042  default:
2043   Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2044   break;
2045  }
2046  return 0;
2047   got_it:
2048  return s;
2049 }
2050
2051
2052 /*
2053  - regexec_flags - match a regexp against a string
2054  */
2055 I32
2056 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2057    char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2058 /* stringarg: the point in the string at which to begin matching */
2059 /* strend:    pointer to null at end of string */
2060 /* strbeg:    real beginning of string */
2061 /* minend:    end of match must be >= minend bytes after stringarg. */
2062 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2063  *            itself is accessed via the pointers above */
2064 /* data:      May be used for some additional optimizations.
2065    Currently its only used, with a U32 cast, for transmitting
2066    the ganch offset when doing a /g match. This will change */
2067 /* nosave:    For optimizations. */
2068
2069 {
2070  dVAR;
2071  struct regexp *const prog = ReANY(rx);
2072  char *s;
2073  regnode *c;
2074  char *startpos = stringarg;
2075  I32 minlen;  /* must match at least this many chars */
2076  I32 dontbother = 0; /* how many characters not to try at end */
2077  I32 end_shift = 0;   /* Same for the end. */  /* CC */
2078  I32 scream_pos = -1;  /* Internal iterator of scream. */
2079  char *scream_olds = NULL;
2080  const bool utf8_target = cBOOL(DO_UTF8(sv));
2081  I32 multiline;
2082  RXi_GET_DECL(prog,progi);
2083  regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2084  regmatch_info *const reginfo = &reginfo_buf;
2085  regexp_paren_pair *swap = NULL;
2086  I32 oldsave;
2087  GET_RE_DEBUG_FLAGS_DECL;
2088
2089  PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2090  PERL_UNUSED_ARG(data);
2091
2092  /* Be paranoid... */
2093  if (prog == NULL || startpos == NULL) {
2094   Perl_croak(aTHX_ "NULL regexp parameter");
2095   return 0;
2096  }
2097
2098  DEBUG_EXECUTE_r(
2099   debug_start_match(rx, utf8_target, startpos, strend,
2100   "Matching");
2101  );
2102
2103
2104  /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2105  * which will call destuctors to reset PL_regmatch_state, free higher
2106  * PL_regmatch_slabs, and clean up regmatch_info_aux and
2107  * regmatch_info_aux_eval */
2108
2109  oldsave = PL_savestack_ix;
2110
2111  multiline = prog->extflags & RXf_PMf_MULTILINE;
2112  minlen = prog->minlen;
2113
2114  if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2115   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2116        "String too short [regexec_flags]...\n"));
2117   goto phooey;
2118  }
2119
2120  /* Check validity of program. */
2121  if (UCHARAT(progi->program) != REG_MAGIC) {
2122   Perl_croak(aTHX_ "corrupted regexp program");
2123  }
2124
2125  RX_MATCH_TAINTED_off(rx);
2126
2127  reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2128  reginfo->intuit = 0;
2129  reginfo->is_utf8_target = cBOOL(utf8_target);
2130  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2131  reginfo->warned = FALSE;
2132  reginfo->strbeg  = strbeg;
2133  reginfo->sv = sv;
2134  reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2135  reginfo->strend = strend;
2136  /* see how far we have to get to not match where we matched before */
2137  reginfo->till = startpos+minend;
2138
2139  /* reserve next 2 or 3 slots in PL_regmatch_state:
2140  * slot N+0: may currently be in use: skip it
2141  * slot N+1: use for regmatch_info_aux struct
2142  * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2143  * slot N+3: ready for use by regmatch()
2144  */
2145
2146  {
2147   regmatch_state *old_regmatch_state;
2148   regmatch_slab  *old_regmatch_slab;
2149   int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2150
2151   /* on first ever match, allocate first slab */
2152   if (!PL_regmatch_slab) {
2153    Newx(PL_regmatch_slab, 1, regmatch_slab);
2154    PL_regmatch_slab->prev = NULL;
2155    PL_regmatch_slab->next = NULL;
2156    PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2157   }
2158
2159   old_regmatch_state = PL_regmatch_state;
2160   old_regmatch_slab  = PL_regmatch_slab;
2161
2162   for (i=0; i <= max; i++) {
2163    if (i == 1)
2164     reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2165    else if (i ==2)
2166     reginfo->info_aux_eval =
2167     reginfo->info_aux->info_aux_eval =
2168        &(PL_regmatch_state->u.info_aux_eval);
2169
2170    if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2171     PL_regmatch_state = S_push_slab(aTHX);
2172   }
2173
2174   /* note initial PL_regmatch_state position; at end of match we'll
2175   * pop back to there and free any higher slabs */
2176
2177   reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2178   reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2179   reginfo->info_aux->poscache = NULL;
2180
2181   SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2182
2183   if ((prog->extflags & RXf_EVAL_SEEN))
2184    S_setup_eval_state(aTHX_ reginfo);
2185   else
2186    reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2187  }
2188
2189  /* If there is a "must appear" string, look for it. */
2190  s = startpos;
2191
2192  if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2193   MAGIC *mg;
2194   if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2195    reginfo->ganch = startpos + prog->gofs;
2196    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2197    "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2198   } else if (sv && (mg = mg_find_mglob(sv))
2199     && mg->mg_len >= 0) {
2200    reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */
2201    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2202     "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2203
2204    if (prog->extflags & RXf_ANCH_GPOS) {
2205     if (s > reginfo->ganch)
2206      goto phooey;
2207     s = reginfo->ganch - prog->gofs;
2208     DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2209      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2210     if (s < strbeg)
2211      goto phooey;
2212    }
2213   }
2214   else if (data) {
2215    reginfo->ganch = strbeg + PTR2UV(data);
2216    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2217     "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2218
2219   } else {    /* pos() not defined */
2220    reginfo->ganch = strbeg;
2221    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2222     "GPOS: reginfo->ganch = strbeg\n"));
2223   }
2224  }
2225  if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2226   /* We have to be careful. If the previous successful match
2227   was from this regex we don't want a subsequent partially
2228   successful match to clobber the old results.
2229   So when we detect this possibility we add a swap buffer
2230   to the re, and switch the buffer each match. If we fail,
2231   we switch it back; otherwise we leave it swapped.
2232   */
2233   swap = prog->offs;
2234   /* do we need a save destructor here for eval dies? */
2235   Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2236   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2237    "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2238    PTR2UV(prog),
2239    PTR2UV(swap),
2240    PTR2UV(prog->offs)
2241   ));
2242  }
2243  if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2244   re_scream_pos_data d;
2245
2246   d.scream_olds = &scream_olds;
2247   d.scream_pos = &scream_pos;
2248   s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d);
2249   if (!s) {
2250    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2251    goto phooey; /* not present */
2252   }
2253  }
2254
2255
2256
2257  /* Simplest case:  anchored match need be tried only once. */
2258  /*  [unless only anchor is BOL and multiline is set] */
2259  if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2260   if (s == startpos && regtry(reginfo, &startpos))
2261    goto got_it;
2262   else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2263     || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2264   {
2265    char *end;
2266
2267    if (minlen)
2268     dontbother = minlen - 1;
2269    end = HOP3c(strend, -dontbother, strbeg) - 1;
2270    /* for multiline we only have to try after newlines */
2271    if (prog->check_substr || prog->check_utf8) {
2272     /* because of the goto we can not easily reuse the macros for bifurcating the
2273     unicode/non-unicode match modes here like we do elsewhere - demerphq */
2274     if (utf8_target) {
2275      if (s == startpos)
2276       goto after_try_utf8;
2277      while (1) {
2278       if (regtry(reginfo, &s)) {
2279        goto got_it;
2280       }
2281      after_try_utf8:
2282       if (s > end) {
2283        goto phooey;
2284       }
2285       if (prog->extflags & RXf_USE_INTUIT) {
2286        s = re_intuit_start(rx, sv, strbeg,
2287          s + UTF8SKIP(s), strend, flags, NULL);
2288        if (!s) {
2289         goto phooey;
2290        }
2291       }
2292       else {
2293        s += UTF8SKIP(s);
2294       }
2295      }
2296     } /* end search for check string in unicode */
2297     else {
2298      if (s == startpos) {
2299       goto after_try_latin;
2300      }
2301      while (1) {
2302       if (regtry(reginfo, &s)) {
2303        goto got_it;
2304       }
2305      after_try_latin:
2306       if (s > end) {
2307        goto phooey;
2308       }
2309       if (prog->extflags & RXf_USE_INTUIT) {
2310        s = re_intuit_start(rx, sv, strbeg,
2311           s + 1, strend, flags, NULL);
2312        if (!s) {
2313         goto phooey;
2314        }
2315       }
2316       else {
2317        s++;
2318       }
2319      }
2320     } /* end search for check string in latin*/
2321    } /* end search for check string */
2322    else { /* search for newline */
2323     if (s > startpos) {
2324      /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2325      s--;
2326     }
2327     /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2328     while (s <= end) { /* note it could be possible to match at the end of the string */
2329      if (*s++ == '\n') { /* don't need PL_utf8skip here */
2330       if (regtry(reginfo, &s))
2331        goto got_it;
2332      }
2333     }
2334    } /* end search for newline */
2335   } /* end anchored/multiline check string search */
2336   goto phooey;
2337  } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2338  {
2339   /* the warning about reginfo->ganch being used without initialization
2340   is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2341   and we only enter this block when the same bit is set. */
2342   char *tmp_s = reginfo->ganch - prog->gofs;
2343
2344   if (tmp_s >= strbeg && regtry(reginfo, &tmp_s))
2345    goto got_it;
2346   goto phooey;
2347  }
2348
2349  /* Messy cases:  unanchored match. */
2350  if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2351   /* we have /x+whatever/ */
2352   /* it must be a one character string (XXXX Except is_utf8_pat?) */
2353   char ch;
2354 #ifdef DEBUGGING
2355   int did_match = 0;
2356 #endif
2357   if (utf8_target) {
2358    if (! prog->anchored_utf8) {
2359     to_utf8_substr(prog);
2360    }
2361    ch = SvPVX_const(prog->anchored_utf8)[0];
2362    REXEC_FBC_SCAN(
2363     if (*s == ch) {
2364      DEBUG_EXECUTE_r( did_match = 1 );
2365      if (regtry(reginfo, &s)) goto got_it;
2366      s += UTF8SKIP(s);
2367      while (s < strend && *s == ch)
2368       s += UTF8SKIP(s);
2369     }
2370    );
2371
2372   }
2373   else {
2374    if (! prog->anchored_substr) {
2375     if (! to_byte_substr(prog)) {
2376      NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2377     }
2378    }
2379    ch = SvPVX_const(prog->anchored_substr)[0];
2380    REXEC_FBC_SCAN(
2381     if (*s == ch) {
2382      DEBUG_EXECUTE_r( did_match = 1 );
2383      if (regtry(reginfo, &s)) goto got_it;
2384      s++;
2385      while (s < strend && *s == ch)
2386       s++;
2387     }
2388    );
2389   }
2390   DEBUG_EXECUTE_r(if (!did_match)
2391     PerlIO_printf(Perl_debug_log,
2392         "Did not find anchored character...\n")
2393    );
2394  }
2395  else if (prog->anchored_substr != NULL
2396    || prog->anchored_utf8 != NULL
2397    || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2398     && prog->float_max_offset < strend - s)) {
2399   SV *must;
2400   I32 back_max;
2401   I32 back_min;
2402   char *last;
2403   char *last1;  /* Last position checked before */
2404 #ifdef DEBUGGING
2405   int did_match = 0;
2406 #endif
2407   if (prog->anchored_substr || prog->anchored_utf8) {
2408    if (utf8_target) {
2409     if (! prog->anchored_utf8) {
2410      to_utf8_substr(prog);
2411     }
2412     must = prog->anchored_utf8;
2413    }
2414    else {
2415     if (! prog->anchored_substr) {
2416      if (! to_byte_substr(prog)) {
2417       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2418      }
2419     }
2420     must = prog->anchored_substr;
2421    }
2422    back_max = back_min = prog->anchored_offset;
2423   } else {
2424    if (utf8_target) {
2425     if (! prog->float_utf8) {
2426      to_utf8_substr(prog);
2427     }
2428     must = prog->float_utf8;
2429    }
2430    else {
2431     if (! prog->float_substr) {
2432      if (! to_byte_substr(prog)) {
2433       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2434      }
2435     }
2436     must = prog->float_substr;
2437    }
2438    back_max = prog->float_max_offset;
2439    back_min = prog->float_min_offset;
2440   }
2441
2442   if (back_min<0) {
2443    last = strend;
2444   } else {
2445    last = HOP3c(strend, /* Cannot start after this */
2446     -(I32)(CHR_SVLEN(must)
2447       - (SvTAIL(must) != 0) + back_min), strbeg);
2448   }
2449   if (s > reginfo->strbeg)
2450    last1 = HOPc(s, -1);
2451   else
2452    last1 = s - 1; /* bogus */
2453
2454   /* XXXX check_substr already used to find "s", can optimize if
2455   check_substr==must. */
2456   scream_pos = -1;
2457   dontbother = end_shift;
2458   strend = HOPc(strend, -dontbother);
2459   while ( (s <= last) &&
2460     (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2461         (unsigned char*)strend, must,
2462         multiline ? FBMrf_MULTILINE : 0)) ) {
2463    DEBUG_EXECUTE_r( did_match = 1 );
2464    if (HOPc(s, -back_max) > last1) {
2465     last1 = HOPc(s, -back_min);
2466     s = HOPc(s, -back_max);
2467    }
2468    else {
2469     char * const t = (last1 >= reginfo->strbeg)
2470          ? HOPc(last1, 1) : last1 + 1;
2471
2472     last1 = HOPc(s, -back_min);
2473     s = t;
2474    }
2475    if (utf8_target) {
2476     while (s <= last1) {
2477      if (regtry(reginfo, &s))
2478       goto got_it;
2479      if (s >= last1) {
2480       s++; /* to break out of outer loop */
2481       break;
2482      }
2483      s += UTF8SKIP(s);
2484     }
2485    }
2486    else {
2487     while (s <= last1) {
2488      if (regtry(reginfo, &s))
2489       goto got_it;
2490      s++;
2491     }
2492    }
2493   }
2494   DEBUG_EXECUTE_r(if (!did_match) {
2495    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2496     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2497    PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2498        ((must == prog->anchored_substr || must == prog->anchored_utf8)
2499        ? "anchored" : "floating"),
2500     quoted, RE_SV_TAIL(must));
2501   });
2502   goto phooey;
2503  }
2504  else if ( (c = progi->regstclass) ) {
2505   if (minlen) {
2506    const OPCODE op = OP(progi->regstclass);
2507    /* don't bother with what can't match */
2508    if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2509     strend = HOPc(strend, -(minlen - 1));
2510   }
2511   DEBUG_EXECUTE_r({
2512    SV * const prop = sv_newmortal();
2513    regprop(prog, prop, c);
2514    {
2515     RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2516      s,strend-s,60);
2517     PerlIO_printf(Perl_debug_log,
2518      "Matching stclass %.*s against %s (%d bytes)\n",
2519      (int)SvCUR(prop), SvPVX_const(prop),
2520      quoted, (int)(strend - s));
2521    }
2522   });
2523   if (find_byclass(prog, c, s, strend, reginfo))
2524    goto got_it;
2525   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2526  }
2527  else {
2528   dontbother = 0;
2529   if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2530    /* Trim the end. */
2531    char *last= NULL;
2532    SV* float_real;
2533    STRLEN len;
2534    const char *little;
2535
2536    if (utf8_target) {
2537     if (! prog->float_utf8) {
2538      to_utf8_substr(prog);
2539     }
2540     float_real = prog->float_utf8;
2541    }
2542    else {
2543     if (! prog->float_substr) {
2544      if (! to_byte_substr(prog)) {
2545       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2546      }
2547     }
2548     float_real = prog->float_substr;
2549    }
2550
2551    little = SvPV_const(float_real, len);
2552    if (SvTAIL(float_real)) {
2553      /* This means that float_real contains an artificial \n on
2554      * the end due to the presence of something like this:
2555      * /foo$/ where we can match both "foo" and "foo\n" at the
2556      * end of the string.  So we have to compare the end of the
2557      * string first against the float_real without the \n and
2558      * then against the full float_real with the string.  We
2559      * have to watch out for cases where the string might be
2560      * smaller than the float_real or the float_real without
2561      * the \n. */
2562      char *checkpos= strend - len;
2563      DEBUG_OPTIMISE_r(
2564       PerlIO_printf(Perl_debug_log,
2565        "%sChecking for float_real.%s\n",
2566        PL_colors[4], PL_colors[5]));
2567      if (checkpos + 1 < strbeg) {
2568       /* can't match, even if we remove the trailing \n
2569       * string is too short to match */
2570       DEBUG_EXECUTE_r(
2571        PerlIO_printf(Perl_debug_log,
2572         "%sString shorter than required trailing substring, cannot match.%s\n",
2573         PL_colors[4], PL_colors[5]));
2574       goto phooey;
2575      } else if (memEQ(checkpos + 1, little, len - 1)) {
2576       /* can match, the end of the string matches without the
2577       * "\n" */
2578       last = checkpos + 1;
2579      } else if (checkpos < strbeg) {
2580       /* cant match, string is too short when the "\n" is
2581       * included */
2582       DEBUG_EXECUTE_r(
2583        PerlIO_printf(Perl_debug_log,
2584         "%sString does not contain required trailing substring, cannot match.%s\n",
2585         PL_colors[4], PL_colors[5]));
2586       goto phooey;
2587      } else if (!multiline) {
2588       /* non multiline match, so compare with the "\n" at the
2589       * end of the string */
2590       if (memEQ(checkpos, little, len)) {
2591        last= checkpos;
2592       } else {
2593        DEBUG_EXECUTE_r(
2594         PerlIO_printf(Perl_debug_log,
2595          "%sString does not contain required trailing substring, cannot match.%s\n",
2596          PL_colors[4], PL_colors[5]));
2597        goto phooey;
2598       }
2599      } else {
2600       /* multiline match, so we have to search for a place
2601       * where the full string is located */
2602       goto find_last;
2603      }
2604    } else {
2605     find_last:
2606      if (len)
2607       last = rninstr(s, strend, little, little + len);
2608      else
2609       last = strend; /* matching "$" */
2610    }
2611    if (!last) {
2612     /* at one point this block contained a comment which was
2613     * probably incorrect, which said that this was a "should not
2614     * happen" case.  Even if it was true when it was written I am
2615     * pretty sure it is not anymore, so I have removed the comment
2616     * and replaced it with this one. Yves */
2617     DEBUG_EXECUTE_r(
2618      PerlIO_printf(Perl_debug_log,
2619       "String does not contain required substring, cannot match.\n"
2620      ));
2621     goto phooey;
2622    }
2623    dontbother = strend - last + prog->float_min_offset;
2624   }
2625   if (minlen && (dontbother < minlen))
2626    dontbother = minlen - 1;
2627   strend -= dontbother;      /* this one's always in bytes! */
2628   /* We don't know much -- general case. */
2629   if (utf8_target) {
2630    for (;;) {
2631     if (regtry(reginfo, &s))
2632      goto got_it;
2633     if (s >= strend)
2634      break;
2635     s += UTF8SKIP(s);
2636    };
2637   }
2638   else {
2639    do {
2640     if (regtry(reginfo, &s))
2641      goto got_it;
2642    } while (s++ < strend);
2643   }
2644  }
2645
2646  /* Failure. */
2647  goto phooey;
2648
2649 got_it:
2650  DEBUG_BUFFERS_r(
2651   if (swap)
2652    PerlIO_printf(Perl_debug_log,
2653     "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2654     PTR2UV(prog),
2655     PTR2UV(swap)
2656    );
2657  );
2658  Safefree(swap);
2659
2660  /* clean up; this will trigger destructors that will free all slabs
2661  * above the current one, and cleanup the regmatch_info_aux
2662  * and regmatch_info_aux_eval sructs */
2663
2664  LEAVE_SCOPE(oldsave);
2665
2666  if (RXp_PAREN_NAMES(prog))
2667   (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2668
2669  RX_MATCH_UTF8_set(rx, utf8_target);
2670
2671  /* make sure $`, $&, $', and $digit will work later */
2672  if ( !(flags & REXEC_NOT_FIRST) ) {
2673   if (flags & REXEC_COPY_STR) {
2674 #ifdef PERL_ANY_COW
2675    if (SvCANCOW(sv)) {
2676     if (DEBUG_C_TEST) {
2677      PerlIO_printf(Perl_debug_log,
2678         "Copy on write: regexp capture, type %d\n",
2679         (int) SvTYPE(sv));
2680     }
2681     RX_MATCH_COPY_FREE(rx);
2682     prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2683     prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2684     assert (SvPOKp(prog->saved_copy));
2685     prog->sublen  = reginfo->strend - strbeg;
2686     prog->suboffset = 0;
2687     prog->subcoffset = 0;
2688    } else
2689 #endif
2690    {
2691     I32 min = 0;
2692     I32 max = reginfo->strend - strbeg;
2693     I32 sublen;
2694
2695     if (    (flags & REXEC_COPY_SKIP_POST)
2696      && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2697      && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2698     ) { /* don't copy $' part of string */
2699      U32 n = 0;
2700      max = -1;
2701      /* calculate the right-most part of the string covered
2702      * by a capture. Due to look-ahead, this may be to
2703      * the right of $&, so we have to scan all captures */
2704      while (n <= prog->lastparen) {
2705       if (prog->offs[n].end > max)
2706        max = prog->offs[n].end;
2707       n++;
2708      }
2709      if (max == -1)
2710       max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2711         ? prog->offs[0].start
2712         : 0;
2713      assert(max >= 0 && max <= reginfo->strend - strbeg);
2714     }
2715
2716     if (    (flags & REXEC_COPY_SKIP_PRE)
2717      && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2718      && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2719     ) { /* don't copy $` part of string */
2720      U32 n = 0;
2721      min = max;
2722      /* calculate the left-most part of the string covered
2723      * by a capture. Due to look-behind, this may be to
2724      * the left of $&, so we have to scan all captures */
2725      while (min && n <= prog->lastparen) {
2726       if (   prog->offs[n].start != -1
2727        && prog->offs[n].start < min)
2728       {
2729        min = prog->offs[n].start;
2730       }
2731       n++;
2732      }
2733      if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2734       && min >  prog->offs[0].end
2735      )
2736       min = prog->offs[0].end;
2737
2738     }
2739
2740     assert(min >= 0 && min <= max
2741      && min <= reginfo->strend - strbeg);
2742     sublen = max - min;
2743
2744     if (RX_MATCH_COPIED(rx)) {
2745      if (sublen > prog->sublen)
2746       prog->subbeg =
2747         (char*)saferealloc(prog->subbeg, sublen+1);
2748     }
2749     else
2750      prog->subbeg = (char*)safemalloc(sublen+1);
2751     Copy(strbeg + min, prog->subbeg, sublen, char);
2752     prog->subbeg[sublen] = '\0';
2753     prog->suboffset = min;
2754     prog->sublen = sublen;
2755     RX_MATCH_COPIED_on(rx);
2756    }
2757    prog->subcoffset = prog->suboffset;
2758    if (prog->suboffset && utf8_target) {
2759     /* Convert byte offset to chars.
2760     * XXX ideally should only compute this if @-/@+
2761     * has been seen, a la PL_sawampersand ??? */
2762
2763     /* If there's a direct correspondence between the
2764     * string which we're matching and the original SV,
2765     * then we can use the utf8 len cache associated with
2766     * the SV. In particular, it means that under //g,
2767     * sv_pos_b2u() will use the previously cached
2768     * position to speed up working out the new length of
2769     * subcoffset, rather than counting from the start of
2770     * the string each time. This stops
2771     *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2772     * from going quadratic */
2773     if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2774      sv_pos_b2u(sv, &(prog->subcoffset));
2775     else
2776      prog->subcoffset = utf8_length((U8*)strbeg,
2777           (U8*)(strbeg+prog->suboffset));
2778    }
2779   }
2780   else {
2781    RX_MATCH_COPY_FREE(rx);
2782    prog->subbeg = strbeg;
2783    prog->suboffset = 0;
2784    prog->subcoffset = 0;
2785    /* use reginfo->strend, as strend may have been modified */
2786    prog->sublen = reginfo->strend - strbeg;
2787   }
2788  }
2789
2790  return 1;
2791
2792 phooey:
2793  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2794       PL_colors[4], PL_colors[5]));
2795
2796  /* clean up; this will trigger destructors that will free all slabs
2797  * above the current one, and cleanup the regmatch_info_aux
2798  * and regmatch_info_aux_eval sructs */
2799
2800  LEAVE_SCOPE(oldsave);
2801
2802  if (swap) {
2803   /* we failed :-( roll it back */
2804   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2805    "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2806    PTR2UV(prog),
2807    PTR2UV(prog->offs),
2808    PTR2UV(swap)
2809   ));
2810   Safefree(prog->offs);
2811   prog->offs = swap;
2812  }
2813  return 0;
2814 }
2815
2816
2817 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
2818  * Do inc before dec, in case old and new rex are the same */
2819 #define SET_reg_curpm(Re2) \
2820  if (reginfo->info_aux_eval) {                   \
2821   (void)ReREFCNT_inc(Re2);      \
2822   ReREFCNT_dec(PM_GETRE(PL_reg_curpm));     \
2823   PM_SETRE((PL_reg_curpm), (Re2));     \
2824  }
2825
2826
2827 /*
2828  - regtry - try match at specific point
2829  */
2830 STATIC I32   /* 0 failure, 1 success */
2831 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2832 {
2833  dVAR;
2834  CHECKPOINT lastcp;
2835  REGEXP *const rx = reginfo->prog;
2836  regexp *const prog = ReANY(rx);
2837  I32 result;
2838  RXi_GET_DECL(prog,progi);
2839  GET_RE_DEBUG_FLAGS_DECL;
2840
2841  PERL_ARGS_ASSERT_REGTRY;
2842
2843  reginfo->cutpoint=NULL;
2844
2845  prog->offs[0].start = *startposp - reginfo->strbeg;
2846  prog->lastparen = 0;
2847  prog->lastcloseparen = 0;
2848
2849  /* XXXX What this code is doing here?!!!  There should be no need
2850  to do this again and again, prog->lastparen should take care of
2851  this!  --ilya*/
2852
2853  /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2854  * Actually, the code in regcppop() (which Ilya may be meaning by
2855  * prog->lastparen), is not needed at all by the test suite
2856  * (op/regexp, op/pat, op/split), but that code is needed otherwise
2857  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2858  * Meanwhile, this code *is* needed for the
2859  * above-mentioned test suite tests to succeed.  The common theme
2860  * on those tests seems to be returning null fields from matches.
2861  * --jhi updated by dapm */
2862 #if 1
2863  if (prog->nparens) {
2864   regexp_paren_pair *pp = prog->offs;
2865   I32 i;
2866   for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2867    ++pp;
2868    pp->start = -1;
2869    pp->end = -1;
2870   }
2871  }
2872 #endif
2873  REGCP_SET(lastcp);
2874  result = regmatch(reginfo, *startposp, progi->program + 1);
2875  if (result != -1) {
2876   prog->offs[0].end = result;
2877   return 1;
2878  }
2879  if (reginfo->cutpoint)
2880   *startposp= reginfo->cutpoint;
2881  REGCP_UNWIND(lastcp);
2882  return 0;
2883 }
2884
2885
2886 #define sayYES goto yes
2887 #define sayNO goto no
2888 #define sayNO_SILENT goto no_silent
2889
2890 /* we dont use STMT_START/END here because it leads to
2891    "unreachable code" warnings, which are bogus, but distracting. */
2892 #define CACHEsayNO \
2893  if (ST.cache_mask) \
2894  reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
2895  sayNO
2896
2897 /* this is used to determine how far from the left messages like
2898    'failed...' are printed. It should be set such that messages
2899    are inline with the regop output that created them.
2900 */
2901 #define REPORT_CODE_OFF 32
2902
2903
2904 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2905 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2906 #define CHRTEST_NOT_A_CP_1 -999
2907 #define CHRTEST_NOT_A_CP_2 -998
2908
2909 /* grab a new slab and return the first slot in it */
2910
2911 STATIC regmatch_state *
2912 S_push_slab(pTHX)
2913 {
2914 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2915  dMY_CXT;
2916 #endif
2917  regmatch_slab *s = PL_regmatch_slab->next;
2918  if (!s) {
2919   Newx(s, 1, regmatch_slab);
2920   s->prev = PL_regmatch_slab;
2921   s->next = NULL;
2922   PL_regmatch_slab->next = s;
2923  }
2924  PL_regmatch_slab = s;
2925  return SLAB_FIRST(s);
2926 }
2927
2928
2929 /* push a new state then goto it */
2930
2931 #define PUSH_STATE_GOTO(state, node, input) \
2932  pushinput = input; \
2933  scan = node; \
2934  st->resume_state = state; \
2935  goto push_state;
2936
2937 /* push a new state with success backtracking, then goto it */
2938
2939 #define PUSH_YES_STATE_GOTO(state, node, input) \
2940  pushinput = input; \
2941  scan = node; \
2942  st->resume_state = state; \
2943  goto push_yes_state;
2944
2945
2946
2947
2948 /*
2949
2950 regmatch() - main matching routine
2951
2952 This is basically one big switch statement in a loop. We execute an op,
2953 set 'next' to point the next op, and continue. If we come to a point which
2954 we may need to backtrack to on failure such as (A|B|C), we push a
2955 backtrack state onto the backtrack stack. On failure, we pop the top
2956 state, and re-enter the loop at the state indicated. If there are no more
2957 states to pop, we return failure.
2958
2959 Sometimes we also need to backtrack on success; for example /A+/, where
2960 after successfully matching one A, we need to go back and try to
2961 match another one; similarly for lookahead assertions: if the assertion
2962 completes successfully, we backtrack to the state just before the assertion
2963 and then carry on.  In these cases, the pushed state is marked as
2964 'backtrack on success too'. This marking is in fact done by a chain of
2965 pointers, each pointing to the previous 'yes' state. On success, we pop to
2966 the nearest yes state, discarding any intermediate failure-only states.
2967 Sometimes a yes state is pushed just to force some cleanup code to be
2968 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2969 it to free the inner regex.
2970
2971 Note that failure backtracking rewinds the cursor position, while
2972 success backtracking leaves it alone.
2973
2974 A pattern is complete when the END op is executed, while a subpattern
2975 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2976 ops trigger the "pop to last yes state if any, otherwise return true"
2977 behaviour.
2978
2979 A common convention in this function is to use A and B to refer to the two
2980 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2981 the subpattern to be matched possibly multiple times, while B is the entire
2982 rest of the pattern. Variable and state names reflect this convention.
2983
2984 The states in the main switch are the union of ops and failure/success of
2985 substates associated with with that op.  For example, IFMATCH is the op
2986 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2987 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2988 successfully matched A and IFMATCH_A_fail is a state saying that we have
2989 just failed to match A. Resume states always come in pairs. The backtrack
2990 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2991 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2992 on success or failure.
2993
2994 The struct that holds a backtracking state is actually a big union, with
2995 one variant for each major type of op. The variable st points to the
2996 top-most backtrack struct. To make the code clearer, within each
2997 block of code we #define ST to alias the relevant union.
2998
2999 Here's a concrete example of a (vastly oversimplified) IFMATCH
3000 implementation:
3001
3002  switch (state) {
3003  ....
3004
3005 #define ST st->u.ifmatch
3006
3007  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3008   ST.foo = ...; // some state we wish to save
3009   ...
3010   // push a yes backtrack state with a resume value of
3011   // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3012   // first node of A:
3013   PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3014   // NOTREACHED
3015
3016  case IFMATCH_A: // we have successfully executed A; now continue with B
3017   next = B;
3018   bar = ST.foo; // do something with the preserved value
3019   break;
3020
3021  case IFMATCH_A_fail: // A failed, so the assertion failed
3022   ...;   // do some housekeeping, then ...
3023   sayNO; // propagate the failure
3024
3025 #undef ST
3026
3027  ...
3028  }
3029
3030 For any old-timers reading this who are familiar with the old recursive
3031 approach, the code above is equivalent to:
3032
3033  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3034  {
3035   int foo = ...
3036   ...
3037   if (regmatch(A)) {
3038    next = B;
3039    bar = foo;
3040    break;
3041   }
3042   ...;   // do some housekeeping, then ...
3043   sayNO; // propagate the failure
3044  }
3045
3046 The topmost backtrack state, pointed to by st, is usually free. If you
3047 want to claim it, populate any ST.foo fields in it with values you wish to
3048 save, then do one of
3049
3050   PUSH_STATE_GOTO(resume_state, node, newinput);
3051   PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3052
3053 which sets that backtrack state's resume value to 'resume_state', pushes a
3054 new free entry to the top of the backtrack stack, then goes to 'node'.
3055 On backtracking, the free slot is popped, and the saved state becomes the
3056 new free state. An ST.foo field in this new top state can be temporarily
3057 accessed to retrieve values, but once the main loop is re-entered, it
3058 becomes available for reuse.
3059
3060 Note that the depth of the backtrack stack constantly increases during the
3061 left-to-right execution of the pattern, rather than going up and down with
3062 the pattern nesting. For example the stack is at its maximum at Z at the
3063 end of the pattern, rather than at X in the following:
3064
3065  /(((X)+)+)+....(Y)+....Z/
3066
3067 The only exceptions to this are lookahead/behind assertions and the cut,
3068 (?>A), which pop all the backtrack states associated with A before
3069 continuing.
3070
3071 Backtrack state structs are allocated in slabs of about 4K in size.
3072 PL_regmatch_state and st always point to the currently active state,
3073 and PL_regmatch_slab points to the slab currently containing
3074 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3075 allocated, and is never freed until interpreter destruction. When the slab
3076 is full, a new one is allocated and chained to the end. At exit from
3077 regmatch(), slabs allocated since entry are freed.
3078
3079 */
3080
3081
3082 #define DEBUG_STATE_pp(pp)        \
3083  DEBUG_STATE_r({         \
3084   DUMP_EXEC_POS(locinput, scan, utf8_target);      \
3085   PerlIO_printf(Perl_debug_log,       \
3086    "    %*s"pp" %s%s%s%s%s\n",       \
3087    depth*2, "",        \
3088    PL_reg_name[st->resume_state],                     \
3089    ((st==yes_state||st==mark_state) ? "[" : ""),   \
3090    ((st==yes_state) ? "Y" : ""),                   \
3091    ((st==mark_state) ? "M" : ""),                  \
3092    ((st==yes_state||st==mark_state) ? "]" : "")    \
3093   );                                                  \
3094  });
3095
3096
3097 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3098
3099 #ifdef DEBUGGING
3100
3101 STATIC void
3102 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3103  const char *start, const char *end, const char *blurb)
3104 {
3105  const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3106
3107  PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3108
3109  if (!PL_colorset)
3110    reginitcolors();
3111  {
3112   RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3113    RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3114
3115   RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3116    start, end - start, 60);
3117
3118   PerlIO_printf(Perl_debug_log,
3119    "%s%s REx%s %s against %s\n",
3120      PL_colors[4], blurb, PL_colors[5], s0, s1);
3121
3122   if (utf8_target||utf8_pat)
3123    PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3124     utf8_pat ? "pattern" : "",
3125     utf8_pat && utf8_target ? " and " : "",
3126     utf8_target ? "string" : ""
3127    );
3128  }
3129 }
3130
3131 STATIC void
3132 S_dump_exec_pos(pTHX_ const char *locinput,
3133      const regnode *scan,
3134      const char *loc_regeol,
3135      const char *loc_bostr,
3136      const char *loc_reg_starttry,
3137      const bool utf8_target)
3138 {
3139  const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3140  const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3141  int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3142  /* The part of the string before starttry has one color
3143  (pref0_len chars), between starttry and current
3144  position another one (pref_len - pref0_len chars),
3145  after the current position the third one.
3146  We assume that pref0_len <= pref_len, otherwise we
3147  decrease pref0_len.  */
3148  int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3149   ? (5 + taill) - l : locinput - loc_bostr;
3150  int pref0_len;
3151
3152  PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3153
3154  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3155   pref_len++;
3156  pref0_len = pref_len  - (locinput - loc_reg_starttry);
3157  if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3158   l = ( loc_regeol - locinput > (5 + taill) - pref_len
3159    ? (5 + taill) - pref_len : loc_regeol - locinput);
3160  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3161   l--;
3162  if (pref0_len < 0)
3163   pref0_len = 0;
3164  if (pref0_len > pref_len)
3165   pref0_len = pref_len;
3166  {
3167   const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3168
3169   RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3170    (locinput - pref_len),pref0_len, 60, 4, 5);
3171
3172   RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3173      (locinput - pref_len + pref0_len),
3174      pref_len - pref0_len, 60, 2, 3);
3175
3176   RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3177      locinput, loc_regeol - locinput, 10, 0, 1);
3178
3179   const STRLEN tlen=len0+len1+len2;
3180   PerlIO_printf(Perl_debug_log,
3181      "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3182      (IV)(locinput - loc_bostr),
3183      len0, s0,
3184      len1, s1,
3185      (docolor ? "" : "> <"),
3186      len2, s2,
3187      (int)(tlen > 19 ? 0 :  19 - tlen),
3188      "");
3189  }
3190 }
3191
3192 #endif
3193
3194 /* reg_check_named_buff_matched()
3195  * Checks to see if a named buffer has matched. The data array of
3196  * buffer numbers corresponding to the buffer is expected to reside
3197  * in the regexp->data->data array in the slot stored in the ARG() of
3198  * node involved. Note that this routine doesn't actually care about the
3199  * name, that information is not preserved from compilation to execution.
3200  * Returns the index of the leftmost defined buffer with the given name
3201  * or 0 if non of the buffers matched.
3202  */
3203 STATIC I32
3204 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3205 {
3206  I32 n;
3207  RXi_GET_DECL(rex,rexi);
3208  SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3209  I32 *nums=(I32*)SvPVX(sv_dat);
3210
3211  PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3212
3213  for ( n=0; n<SvIVX(sv_dat); n++ ) {
3214   if ((I32)rex->lastparen >= nums[n] &&
3215    rex->offs[nums[n]].end != -1)
3216   {
3217    return nums[n];
3218   }
3219  }
3220  return 0;
3221 }
3222
3223
3224 static bool
3225 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3226   U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3227 {
3228  /* This function determines if there are one or two characters that match
3229  * the first character of the passed-in EXACTish node <text_node>, and if
3230  * so, returns them in the passed-in pointers.
3231  *
3232  * If it determines that no possible character in the target string can
3233  * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3234  * the first character in <text_node> requires UTF-8 to represent, and the
3235  * target string isn't in UTF-8.)
3236  *
3237  * If there are more than two characters that could match the beginning of
3238  * <text_node>, or if more context is required to determine a match or not,
3239  * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3240  *
3241  * The motiviation behind this function is to allow the caller to set up
3242  * tight loops for matching.  If <text_node> is of type EXACT, there is
3243  * only one possible character that can match its first character, and so
3244  * the situation is quite simple.  But things get much more complicated if
3245  * folding is involved.  It may be that the first character of an EXACTFish
3246  * node doesn't participate in any possible fold, e.g., punctuation, so it
3247  * can be matched only by itself.  The vast majority of characters that are
3248  * in folds match just two things, their lower and upper-case equivalents.
3249  * But not all are like that; some have multiple possible matches, or match
3250  * sequences of more than one character.  This function sorts all that out.
3251  *
3252  * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3253  * loop of trying to match A*, we know we can't exit where the thing
3254  * following it isn't a B.  And something can't be a B unless it is the
3255  * beginning of B.  By putting a quick test for that beginning in a tight
3256  * loop, we can rule out things that can't possibly be B without having to
3257  * break out of the loop, thus avoiding work.  Similarly, if A is a single
3258  * character, we can make a tight loop matching A*, using the outputs of
3259  * this function.
3260  *
3261  * If the target string to match isn't in UTF-8, and there aren't
3262  * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3263  * the one or two possible octets (which are characters in this situation)
3264  * that can match.  In all cases, if there is only one character that can
3265  * match, *<c1p> and *<c2p> will be identical.
3266  *
3267  * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3268  * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3269  * can match the beginning of <text_node>.  They should be declared with at
3270  * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3271  * undefined what these contain.)  If one or both of the buffers are
3272  * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3273  * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3274  * *<c2p> will be set to a negative number(s) that shouldn't match any code
3275  * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3276  * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3277
3278  const bool utf8_target = reginfo->is_utf8_target;
3279
3280  UV c1 = CHRTEST_NOT_A_CP_1;
3281  UV c2 = CHRTEST_NOT_A_CP_2;
3282  bool use_chrtest_void = FALSE;
3283  const bool is_utf8_pat = reginfo->is_utf8_pat;
3284
3285  /* Used when we have both utf8 input and utf8 output, to avoid converting
3286  * to/from code points */
3287  bool utf8_has_been_setup = FALSE;
3288
3289  dVAR;
3290
3291  U8 *pat = (U8*)STRING(text_node);
3292
3293  if (OP(text_node) == EXACT) {
3294
3295   /* In an exact node, only one thing can be matched, that first
3296   * character.  If both the pat and the target are UTF-8, we can just
3297   * copy the input to the output, avoiding finding the code point of
3298   * that character */
3299   if (!is_utf8_pat) {
3300    c2 = c1 = *pat;
3301   }
3302   else if (utf8_target) {
3303    Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3304    Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3305    utf8_has_been_setup = TRUE;
3306   }
3307   else {
3308    c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3309   }
3310  }
3311  else /* an EXACTFish node */
3312   if ((is_utf8_pat
3313      && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3314              pat + STR_LEN(text_node)))
3315    || (!is_utf8_pat
3316      && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3317              pat + STR_LEN(text_node))))
3318  {
3319   /* Multi-character folds require more context to sort out.  Also
3320   * PL_utf8_foldclosures used below doesn't handle them, so have to be
3321   * handled outside this routine */
3322   use_chrtest_void = TRUE;
3323  }
3324  else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3325   c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3326   if (c1 > 256) {
3327    /* Load the folds hash, if not already done */
3328    SV** listp;
3329    if (! PL_utf8_foldclosures) {
3330     if (! PL_utf8_tofold) {
3331      U8 dummy[UTF8_MAXBYTES+1];
3332
3333      /* Force loading this by folding an above-Latin1 char */
3334      to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3335      assert(PL_utf8_tofold); /* Verify that worked */
3336     }
3337     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3338    }
3339
3340    /* The fold closures data structure is a hash with the keys being
3341    * the UTF-8 of every character that is folded to, like 'k', and
3342    * the values each an array of all code points that fold to its
3343    * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3344    * not included */
3345    if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3346          (char *) pat,
3347          UTF8SKIP(pat),
3348          FALSE))))
3349    {
3350     /* Not found in the hash, therefore there are no folds
3351     * containing it, so there is only a single character that
3352     * could match */
3353     c2 = c1;
3354    }
3355    else {  /* Does participate in folds */
3356     AV* list = (AV*) *listp;
3357     if (av_len(list) != 1) {
3358
3359      /* If there aren't exactly two folds to this, it is outside
3360      * the scope of this function */
3361      use_chrtest_void = TRUE;
3362     }
3363     else {  /* There are two.  Get them */
3364      SV** c_p = av_fetch(list, 0, FALSE);
3365      if (c_p == NULL) {
3366       Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3367      }
3368      c1 = SvUV(*c_p);
3369
3370      c_p = av_fetch(list, 1, FALSE);
3371      if (c_p == NULL) {
3372       Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3373      }
3374      c2 = SvUV(*c_p);
3375
3376      /* Folds that cross the 255/256 boundary are forbidden if
3377      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3378      * pattern character is above 256, and its only other match
3379      * is below 256, the only legal match will be to itself.
3380      * We have thrown away the original, so have to compute
3381      * which is the one above 255 */
3382      if ((c1 < 256) != (c2 < 256)) {
3383       if (OP(text_node) == EXACTFL
3384        || (OP(text_node) == EXACTFA
3385         && (isASCII(c1) || isASCII(c2))))
3386       {
3387        if (c1 < 256) {
3388         c1 = c2;
3389        }
3390        else {
3391         c2 = c1;
3392        }
3393       }
3394      }
3395     }
3396    }
3397   }
3398   else /* Here, c1 is < 255 */
3399    if (utf8_target
3400     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3401     && OP(text_node) != EXACTFL
3402     && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3403   {
3404    /* Here, there could be something above Latin1 in the target which
3405    * folds to this character in the pattern.  All such cases except
3406    * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3407    * involved in their folds, so are outside the scope of this
3408    * function */
3409    if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3410     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3411    }
3412    else {
3413     use_chrtest_void = TRUE;
3414    }
3415   }
3416   else { /* Here nothing above Latin1 can fold to the pattern character */
3417    switch (OP(text_node)) {
3418
3419     case EXACTFL:   /* /l rules */
3420      c2 = PL_fold_locale[c1];
3421      break;
3422
3423     case EXACTF:
3424      if (! utf8_target) {    /* /d rules */
3425       c2 = PL_fold[c1];
3426       break;
3427      }
3428      /* FALLTHROUGH */
3429      /* /u rules for all these.  This happens to work for
3430      * EXACTFA as nothing in Latin1 folds to ASCII */
3431     case EXACTFA:
3432     case EXACTFU_TRICKYFOLD:
3433     case EXACTFU_SS:
3434     case EXACTFU:
3435      c2 = PL_fold_latin1[c1];
3436      break;
3437
3438     default:
3439      Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3440      assert(0); /* NOTREACHED */
3441    }
3442   }
3443  }
3444
3445  /* Here have figured things out.  Set up the returns */
3446  if (use_chrtest_void) {
3447   *c2p = *c1p = CHRTEST_VOID;
3448  }
3449  else if (utf8_target) {
3450   if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3451    uvchr_to_utf8(c1_utf8, c1);
3452    uvchr_to_utf8(c2_utf8, c2);
3453   }
3454
3455   /* Invariants are stored in both the utf8 and byte outputs; Use
3456   * negative numbers otherwise for the byte ones.  Make sure that the
3457   * byte ones are the same iff the utf8 ones are the same */
3458   *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3459   *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3460     ? *c2_utf8
3461     : (c1 == c2)
3462     ? CHRTEST_NOT_A_CP_1
3463     : CHRTEST_NOT_A_CP_2;
3464  }
3465  else if (c1 > 255) {
3466  if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3467       can represent */
3468   return FALSE;
3469  }
3470
3471  *c1p = *c2p = c2;    /* c2 is the only representable value */
3472  }
3473  else {  /* c1 is representable; see about c2 */
3474  *c1p = c1;
3475  *c2p = (c2 < 256) ? c2 : c1;
3476  }
3477
3478  return TRUE;
3479 }
3480
3481 /* returns -1 on failure, $+[0] on success */
3482 STATIC I32
3483 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3484 {
3485 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3486  dMY_CXT;
3487 #endif
3488  dVAR;
3489  const bool utf8_target = reginfo->is_utf8_target;
3490  const U32 uniflags = UTF8_ALLOW_DEFAULT;
3491  REGEXP *rex_sv = reginfo->prog;
3492  regexp *rex = ReANY(rex_sv);
3493  RXi_GET_DECL(rex,rexi);
3494  /* the current state. This is a cached copy of PL_regmatch_state */
3495  regmatch_state *st;
3496  /* cache heavy used fields of st in registers */
3497  regnode *scan;
3498  regnode *next;
3499  U32 n = 0; /* general value; init to avoid compiler warning */
3500  I32 ln = 0; /* len or last;  init to avoid compiler warning */
3501  char *locinput = startpos;
3502  char *pushinput; /* where to continue after a PUSH */
3503  I32 nextchr;   /* is always set to UCHARAT(locinput) */
3504
3505  bool result = 0;     /* return value of S_regmatch */
3506  int depth = 0;     /* depth of backtrack stack */
3507  U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3508  const U32 max_nochange_depth =
3509   (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3510   3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3511  regmatch_state *yes_state = NULL; /* state to pop to on success of
3512                subpattern */
3513  /* mark_state piggy backs on the yes_state logic so that when we unwind
3514  the stack on success we can update the mark_state as we go */
3515  regmatch_state *mark_state = NULL; /* last mark state we have seen */
3516  regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3517  struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3518  U32 state_num;
3519  bool no_final = 0;      /* prevent failure from backtracking? */
3520  bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3521  char *startpoint = locinput;
3522  SV *popmark = NULL;     /* are we looking for a mark? */
3523  SV *sv_commit = NULL;   /* last mark name seen in failure */
3524  SV *sv_yes_mark = NULL; /* last mark name we have seen
3525        during a successful match */
3526  U32 lastopen = 0;       /* last open we saw */
3527  bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3528  SV* const oreplsv = GvSV(PL_replgv);
3529  /* these three flags are set by various ops to signal information to
3530  * the very next op. They have a useful lifetime of exactly one loop
3531  * iteration, and are not preserved or restored by state pushes/pops
3532  */
3533  bool sw = 0;     /* the condition value in (?(cond)a|b) */
3534  bool minmod = 0;     /* the next "{n,m}" is a "{n,m}?" */
3535  int logical = 0;     /* the following EVAL is:
3536         0: (?{...})
3537         1: (?(?{...})X|Y)
3538         2: (??{...})
3539        or the following IFMATCH/UNLESSM is:
3540         false: plain (?=foo)
3541         true:  used as a condition: (?(?=foo))
3542        */
3543  PAD* last_pad = NULL;
3544  dMULTICALL;
3545  I32 gimme = G_SCALAR;
3546  CV *caller_cv = NULL; /* who called us */
3547  CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3548  CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3549  U32 maxopenparen = 0;       /* max '(' index seen so far */
3550  int to_complement;  /* Invert the result? */
3551  _char_class_number classnum;
3552  bool is_utf8_pat = reginfo->is_utf8_pat;
3553
3554 #ifdef DEBUGGING
3555  GET_RE_DEBUG_FLAGS_DECL;
3556 #endif
3557
3558  /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3559  multicall_oldcatch = 0;
3560  multicall_cv = NULL;
3561  cx = NULL;
3562  PERL_UNUSED_VAR(multicall_cop);
3563  PERL_UNUSED_VAR(newsp);
3564
3565
3566  PERL_ARGS_ASSERT_REGMATCH;
3567
3568  DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3569    PerlIO_printf(Perl_debug_log,"regmatch start\n");
3570  }));
3571
3572  st = PL_regmatch_state;
3573
3574  /* Note that nextchr is a byte even in UTF */
3575  SET_nextchr;
3576  scan = prog;
3577  while (scan != NULL) {
3578
3579   DEBUG_EXECUTE_r( {
3580    SV * const prop = sv_newmortal();
3581    regnode *rnext=regnext(scan);
3582    DUMP_EXEC_POS( locinput, scan, utf8_target );
3583    regprop(rex, prop, scan);
3584
3585    PerlIO_printf(Perl_debug_log,
3586      "%3"IVdf":%*s%s(%"IVdf")\n",
3587      (IV)(scan - rexi->program), depth*2, "",
3588      SvPVX_const(prop),
3589      (PL_regkind[OP(scan)] == END || !rnext) ?
3590       0 : (IV)(rnext - rexi->program));
3591   });
3592
3593   next = scan + NEXT_OFF(scan);
3594   if (next == scan)
3595    next = NULL;
3596   state_num = OP(scan);
3597
3598   REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3599  reenter_switch:
3600   to_complement = 0;
3601
3602   SET_nextchr;
3603   assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3604
3605   switch (state_num) {
3606   case BOL: /*  /^../  */
3607    if (locinput == reginfo->strbeg)
3608     break;
3609    sayNO;
3610
3611   case MBOL: /*  /^../m  */
3612    if (locinput == reginfo->strbeg ||
3613     (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3614    {
3615     break;
3616    }
3617    sayNO;
3618
3619   case SBOL: /*  /^../s  */
3620    if (locinput == reginfo->strbeg)
3621     break;
3622    sayNO;
3623
3624   case GPOS: /*  \G  */
3625    if (locinput == reginfo->ganch)
3626     break;
3627    sayNO;
3628
3629   case KEEPS: /*   \K  */
3630    /* update the startpoint */
3631    st->u.keeper.val = rex->offs[0].start;
3632    rex->offs[0].start = locinput - reginfo->strbeg;
3633    PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3634    assert(0); /*NOTREACHED*/
3635   case KEEPS_next_fail:
3636    /* rollback the start point change */
3637    rex->offs[0].start = st->u.keeper.val;
3638    sayNO_SILENT;
3639    assert(0); /*NOTREACHED*/
3640
3641   case EOL: /* /..$/  */
3642     goto seol;
3643
3644   case MEOL: /* /..$/m  */
3645    if (!NEXTCHR_IS_EOS && nextchr != '\n')
3646     sayNO;
3647    break;
3648
3649   case SEOL: /* /..$/s  */
3650   seol:
3651    if (!NEXTCHR_IS_EOS && nextchr != '\n')
3652     sayNO;
3653    if (reginfo->strend - locinput > 1)
3654     sayNO;
3655    break;
3656
3657   case EOS: /*  \z  */
3658    if (!NEXTCHR_IS_EOS)
3659     sayNO;
3660    break;
3661
3662   case SANY: /*  /./s  */
3663    if (NEXTCHR_IS_EOS)
3664     sayNO;
3665    goto increment_locinput;
3666
3667   case CANY: /*  \C  */
3668    if (NEXTCHR_IS_EOS)
3669     sayNO;
3670    locinput++;
3671    break;
3672
3673   case REG_ANY: /*  /./  */
3674    if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3675     sayNO;
3676    goto increment_locinput;
3677
3678
3679 #undef  ST
3680 #define ST st->u.trie
3681   case TRIEC: /* (ab|cd) with known charclass */
3682    /* In this case the charclass data is available inline so
3683    we can fail fast without a lot of extra overhead.
3684    */
3685    if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3686     DEBUG_EXECUTE_r(
3687      PerlIO_printf(Perl_debug_log,
3688        "%*s  %sfailed to match trie start class...%s\n",
3689        REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3690     );
3691     sayNO_SILENT;
3692     assert(0); /* NOTREACHED */
3693    }
3694    /* FALL THROUGH */
3695   case TRIE:  /* (ab|cd)  */
3696    /* the basic plan of execution of the trie is:
3697    * At the beginning, run though all the states, and
3698    * find the longest-matching word. Also remember the position
3699    * of the shortest matching word. For example, this pattern:
3700    *    1  2 3 4    5
3701    *    ab|a|x|abcd|abc
3702    * when matched against the string "abcde", will generate
3703    * accept states for all words except 3, with the longest
3704    * matching word being 4, and the shortest being 2 (with
3705    * the position being after char 1 of the string).
3706    *
3707    * Then for each matching word, in word order (i.e. 1,2,4,5),
3708    * we run the remainder of the pattern; on each try setting
3709    * the current position to the character following the word,
3710    * returning to try the next word on failure.
3711    *
3712    * We avoid having to build a list of words at runtime by
3713    * using a compile-time structure, wordinfo[].prev, which
3714    * gives, for each word, the previous accepting word (if any).
3715    * In the case above it would contain the mappings 1->2, 2->0,
3716    * 3->0, 4->5, 5->1.  We can use this table to generate, from
3717    * the longest word (4 above), a list of all words, by
3718    * following the list of prev pointers; this gives us the
3719    * unordered list 4,5,1,2. Then given the current word we have
3720    * just tried, we can go through the list and find the
3721    * next-biggest word to try (so if we just failed on word 2,
3722    * the next in the list is 4).
3723    *
3724    * Since at runtime we don't record the matching position in
3725    * the string for each word, we have to work that out for
3726    * each word we're about to process. The wordinfo table holds
3727    * the character length of each word; given that we recorded
3728    * at the start: the position of the shortest word and its
3729    * length in chars, we just need to move the pointer the
3730    * difference between the two char lengths. Depending on
3731    * Unicode status and folding, that's cheap or expensive.
3732    *
3733    * This algorithm is optimised for the case where are only a
3734    * small number of accept states, i.e. 0,1, or maybe 2.
3735    * With lots of accepts states, and having to try all of them,
3736    * it becomes quadratic on number of accept states to find all
3737    * the next words.
3738    */
3739
3740    {
3741     /* what type of TRIE am I? (utf8 makes this contextual) */
3742     DECL_TRIE_TYPE(scan);
3743
3744     /* what trie are we using right now */
3745     reg_trie_data * const trie
3746      = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3747     HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3748     U32 state = trie->startstate;
3749
3750     if (   trie->bitmap
3751      && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3752     {
3753      if (trie->states[ state ].wordnum) {
3754       DEBUG_EXECUTE_r(
3755        PerlIO_printf(Perl_debug_log,
3756           "%*s  %smatched empty string...%s\n",
3757           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3758       );
3759       if (!trie->jump)
3760        break;
3761      } else {
3762       DEBUG_EXECUTE_r(
3763        PerlIO_printf(Perl_debug_log,
3764           "%*s  %sfailed to match trie start class...%s\n",
3765           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3766       );
3767       sayNO_SILENT;
3768     }
3769     }
3770
3771    {
3772     U8 *uc = ( U8* )locinput;
3773
3774     STRLEN len = 0;
3775     STRLEN foldlen = 0;
3776     U8 *uscan = (U8*)NULL;
3777     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3778     U32 charcount = 0; /* how many input chars we have matched */
3779     U32 accepted = 0; /* have we seen any accepting states? */
3780
3781     ST.jump = trie->jump;
3782     ST.me = scan;
3783     ST.firstpos = NULL;
3784     ST.longfold = FALSE; /* char longer if folded => it's harder */
3785     ST.nextword = 0;
3786
3787     /* fully traverse the TRIE; note the position of the
3788     shortest accept state and the wordnum of the longest
3789     accept state */
3790
3791     while ( state && uc <= (U8*)(reginfo->strend) ) {
3792      U32 base = trie->states[ state ].trans.base;
3793      UV uvc = 0;
3794      U16 charid = 0;
3795      U16 wordnum;
3796      wordnum = trie->states[ state ].wordnum;
3797
3798      if (wordnum) { /* it's an accept state */
3799       if (!accepted) {
3800        accepted = 1;
3801        /* record first match position */
3802        if (ST.longfold) {
3803         ST.firstpos = (U8*)locinput;
3804         ST.firstchars = 0;
3805        }
3806        else {
3807         ST.firstpos = uc;
3808         ST.firstchars = charcount;
3809        }
3810       }
3811       if (!ST.nextword || wordnum < ST.nextword)
3812        ST.nextword = wordnum;
3813       ST.topword = wordnum;
3814      }
3815
3816      DEBUG_TRIE_EXECUTE_r({
3817         DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3818         PerlIO_printf( Perl_debug_log,
3819          "%*s  %sState: %4"UVxf" Accepted: %c ",
3820          2+depth * 2, "", PL_colors[4],
3821          (UV)state, (accepted ? 'Y' : 'N'));
3822      });
3823
3824      /* read a char and goto next state */
3825      if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3826       I32 offset;
3827       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3828            uscan, len, uvc, charid, foldlen,
3829            foldbuf, uniflags);
3830       charcount++;
3831       if (foldlen>0)
3832        ST.longfold = TRUE;
3833       if (charid &&
3834        ( ((offset =
3835        base + charid - 1 - trie->uniquecharcount)) >= 0)
3836
3837        && ((U32)offset < trie->lasttrans)
3838        && trie->trans[offset].check == state)
3839       {
3840        state = trie->trans[offset].next;
3841       }
3842       else {
3843        state = 0;
3844       }
3845       uc += len;
3846
3847      }
3848      else {
3849       state = 0;
3850      }
3851      DEBUG_TRIE_EXECUTE_r(
3852       PerlIO_printf( Perl_debug_log,
3853        "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3854        charid, uvc, (UV)state, PL_colors[5] );
3855      );
3856     }
3857     if (!accepted)
3858     sayNO;
3859
3860     /* calculate total number of accept states */
3861     {
3862      U16 w = ST.topword;
3863      accepted = 0;
3864      while (w) {
3865       w = trie->wordinfo[w].prev;
3866       accepted++;
3867      }
3868      ST.accepted = accepted;
3869     }
3870
3871     DEBUG_EXECUTE_r(
3872      PerlIO_printf( Perl_debug_log,
3873       "%*s  %sgot %"IVdf" possible matches%s\n",
3874       REPORT_CODE_OFF + depth * 2, "",
3875       PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3876     );
3877     goto trie_first_try; /* jump into the fail handler */
3878    }}
3879    assert(0); /* NOTREACHED */
3880
3881   case TRIE_next_fail: /* we failed - try next alternative */
3882   {
3883    U8 *uc;
3884    if ( ST.jump) {
3885     REGCP_UNWIND(ST.cp);
3886     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3887    }
3888    if (!--ST.accepted) {
3889     DEBUG_EXECUTE_r({
3890      PerlIO_printf( Perl_debug_log,
3891       "%*s  %sTRIE failed...%s\n",
3892       REPORT_CODE_OFF+depth*2, "",
3893       PL_colors[4],
3894       PL_colors[5] );
3895     });
3896     sayNO_SILENT;
3897    }
3898    {
3899     /* Find next-highest word to process.  Note that this code
3900     * is O(N^2) per trie run (O(N) per branch), so keep tight */
3901     U16 min = 0;
3902     U16 word;
3903     U16 const nextword = ST.nextword;
3904     reg_trie_wordinfo * const wordinfo
3905      = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3906     for (word=ST.topword; word; word=wordinfo[word].prev) {
3907      if (word > nextword && (!min || word < min))
3908       min = word;
3909     }
3910     ST.nextword = min;
3911    }
3912
3913   trie_first_try:
3914    if (do_cutgroup) {
3915     do_cutgroup = 0;
3916     no_final = 0;
3917    }
3918
3919    if ( ST.jump) {
3920     ST.lastparen = rex->lastparen;
3921     ST.lastcloseparen = rex->lastcloseparen;
3922     REGCP_SET(ST.cp);
3923    }
3924
3925    /* find start char of end of current word */
3926    {
3927     U32 chars; /* how many chars to skip */
3928     reg_trie_data * const trie
3929      = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3930
3931     assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3932        >=  ST.firstchars);
3933     chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3934        - ST.firstchars;
3935     uc = ST.firstpos;
3936
3937     if (ST.longfold) {
3938      /* the hard option - fold each char in turn and find
3939      * its folded length (which may be different */
3940      U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3941      STRLEN foldlen;
3942      STRLEN len;
3943      UV uvc;
3944      U8 *uscan;
3945
3946      while (chars) {
3947       if (utf8_target) {
3948        uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3949              uniflags);
3950        uc += len;
3951       }
3952       else {
3953        uvc = *uc;
3954        uc++;
3955       }
3956       uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3957       uscan = foldbuf;
3958       while (foldlen) {
3959        if (!--chars)
3960         break;
3961        uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3962            uniflags);
3963        uscan += len;
3964        foldlen -= len;
3965       }
3966      }
3967     }
3968     else {
3969      if (utf8_target)
3970       while (chars--)
3971        uc += UTF8SKIP(uc);
3972      else
3973       uc += chars;
3974     }
3975    }
3976
3977    scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3978        ? ST.jump[ST.nextword]
3979        : NEXT_OFF(ST.me));
3980
3981    DEBUG_EXECUTE_r({
3982     PerlIO_printf( Perl_debug_log,
3983      "%*s  %sTRIE matched word #%d, continuing%s\n",
3984      REPORT_CODE_OFF+depth*2, "",
3985      PL_colors[4],
3986      ST.nextword,
3987      PL_colors[5]
3988      );
3989    });
3990
3991    if (ST.accepted > 1 || has_cutgroup) {
3992     PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
3993     assert(0); /* NOTREACHED */
3994    }
3995    /* only one choice left - just continue */
3996    DEBUG_EXECUTE_r({
3997     AV *const trie_words
3998      = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3999     SV ** const tmp = av_fetch( trie_words,
4000      ST.nextword-1, 0 );
4001     SV *sv= tmp ? sv_newmortal() : NULL;
4002
4003     PerlIO_printf( Perl_debug_log,
4004      "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4005      REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4006      ST.nextword,
4007      tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4008        PL_colors[0], PL_colors[1],
4009        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4010       )
4011      : "not compiled under -Dr",
4012      PL_colors[5] );
4013    });
4014
4015    locinput = (char*)uc;
4016    continue; /* execute rest of RE */
4017    assert(0); /* NOTREACHED */
4018   }
4019 #undef  ST
4020
4021   case EXACT: {            /*  /abc/        */
4022    char *s = STRING(scan);
4023    ln = STR_LEN(scan);
4024    if (utf8_target != is_utf8_pat) {
4025     /* The target and the pattern have differing utf8ness. */
4026     char *l = locinput;
4027     const char * const e = s + ln;
4028
4029     if (utf8_target) {
4030      /* The target is utf8, the pattern is not utf8.
4031      * Above-Latin1 code points can't match the pattern;
4032      * invariants match exactly, and the other Latin1 ones need
4033      * to be downgraded to a single byte in order to do the
4034      * comparison.  (If we could be confident that the target
4035      * is not malformed, this could be refactored to have fewer
4036      * tests by just assuming that if the first bytes match, it
4037      * is an invariant, but there are tests in the test suite
4038      * dealing with (??{...}) which violate this) */
4039      while (s < e) {
4040       if (l >= reginfo->strend
4041        || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4042       {
4043        sayNO;
4044       }
4045       if (UTF8_IS_INVARIANT(*(U8*)l)) {
4046        if (*l != *s) {
4047         sayNO;
4048        }
4049        l++;
4050       }
4051       else {
4052        if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4053         sayNO;
4054        }
4055        l += 2;
4056       }
4057       s++;
4058      }
4059     }
4060     else {
4061      /* The target is not utf8, the pattern is utf8. */
4062      while (s < e) {
4063       if (l >= reginfo->strend
4064        || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4065       {
4066        sayNO;
4067       }
4068       if (UTF8_IS_INVARIANT(*(U8*)s)) {
4069        if (*s != *l) {
4070         sayNO;
4071        }
4072        s++;
4073       }
4074       else {
4075        if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4076         sayNO;
4077        }
4078        s += 2;
4079       }
4080       l++;
4081      }
4082     }
4083     locinput = l;
4084    }
4085    else {
4086     /* The target and the pattern have the same utf8ness. */
4087     /* Inline the first character, for speed. */
4088     if (reginfo->strend - locinput < ln
4089      || UCHARAT(s) != nextchr
4090      || (ln > 1 && memNE(s, locinput, ln)))
4091     {
4092      sayNO;
4093     }
4094     locinput += ln;
4095    }
4096    break;
4097    }
4098
4099   case EXACTFL: {          /*  /abc/il      */
4100    re_fold_t folder;
4101    const U8 * fold_array;
4102    const char * s;
4103    U32 fold_utf8_flags;
4104
4105    RX_MATCH_TAINTED_on(reginfo->prog);
4106    folder = foldEQ_locale;
4107    fold_array = PL_fold_locale;
4108    fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4109    goto do_exactf;
4110
4111   case EXACTFU_SS:         /*  /\x{df}/iu   */
4112   case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4113   case EXACTFU:            /*  /abc/iu      */
4114    folder = foldEQ_latin1;
4115    fold_array = PL_fold_latin1;
4116    fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4117    goto do_exactf;
4118
4119   case EXACTFA:            /*  /abc/iaa     */
4120    folder = foldEQ_latin1;
4121    fold_array = PL_fold_latin1;
4122    fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4123    goto do_exactf;
4124
4125   case EXACTF:             /*  /abc/i       */
4126    folder = foldEQ;
4127    fold_array = PL_fold;
4128    fold_utf8_flags = 0;
4129
4130   do_exactf:
4131    s = STRING(scan);
4132    ln = STR_LEN(scan);
4133
4134    if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4135    /* Either target or the pattern are utf8, or has the issue where
4136    * the fold lengths may differ. */
4137     const char * const l = locinput;
4138     char *e = reginfo->strend;
4139
4140     if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4141           l, &e, 0,  utf8_target, fold_utf8_flags))
4142     {
4143      sayNO;
4144     }
4145     locinput = e;
4146     break;
4147    }
4148
4149    /* Neither the target nor the pattern are utf8 */
4150    if (UCHARAT(s) != nextchr
4151     && !NEXTCHR_IS_EOS
4152     && UCHARAT(s) != fold_array[nextchr])
4153    {
4154     sayNO;
4155    }
4156    if (reginfo->strend - locinput < ln)
4157     sayNO;
4158    if (ln > 1 && ! folder(s, locinput, ln))
4159     sayNO;
4160    locinput += ln;
4161    break;
4162   }
4163
4164   /* XXX Could improve efficiency by separating these all out using a
4165   * macro or in-line function.  At that point regcomp.c would no longer
4166   * have to set the FLAGS fields of these */
4167   case BOUNDL:  /*  /\b/l  */
4168   case NBOUNDL: /*  /\B/l  */
4169    RX_MATCH_TAINTED_on(reginfo->prog);
4170    /* FALL THROUGH */
4171   case BOUND:   /*  /\b/   */
4172   case BOUNDU:  /*  /\b/u  */
4173   case BOUNDA:  /*  /\b/a  */
4174   case NBOUND:  /*  /\B/   */
4175   case NBOUNDU: /*  /\B/u  */
4176   case NBOUNDA: /*  /\B/a  */
4177    /* was last char in word? */
4178    if (utf8_target
4179     && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4180     && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4181    {
4182     if (locinput == reginfo->strbeg)
4183      ln = '\n';
4184     else {
4185      const U8 * const r =
4186        reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4187
4188      ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4189     }
4190     if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4191      ln = isWORDCHAR_uni(ln);
4192      if (NEXTCHR_IS_EOS)
4193       n = 0;
4194      else {
4195       LOAD_UTF8_CHARCLASS_ALNUM();
4196       n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4197                 utf8_target);
4198      }
4199     }
4200     else {
4201      ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4202      n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4203     }
4204    }
4205    else {
4206
4207     /* Here the string isn't utf8, or is utf8 and only ascii
4208     * characters are to match \w.  In the latter case looking at
4209     * the byte just prior to the current one may be just the final
4210     * byte of a multi-byte character.  This is ok.  There are two
4211     * cases:
4212     * 1) it is a single byte character, and then the test is doing
4213     * just what it's supposed to.
4214     * 2) it is a multi-byte character, in which case the final
4215     * byte is never mistakable for ASCII, and so the test
4216     * will say it is not a word character, which is the
4217     * correct answer. */
4218     ln = (locinput != reginfo->strbeg) ?
4219      UCHARAT(locinput - 1) : '\n';
4220     switch (FLAGS(scan)) {
4221      case REGEX_UNICODE_CHARSET:
4222       ln = isWORDCHAR_L1(ln);
4223       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4224       break;
4225      case REGEX_LOCALE_CHARSET:
4226       ln = isWORDCHAR_LC(ln);
4227       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4228       break;
4229      case REGEX_DEPENDS_CHARSET:
4230       ln = isWORDCHAR(ln);
4231       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4232       break;
4233      case REGEX_ASCII_RESTRICTED_CHARSET:
4234      case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4235       ln = isWORDCHAR_A(ln);
4236       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4237       break;
4238      default:
4239       Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4240       break;
4241     }
4242    }
4243    /* Note requires that all BOUNDs be lower than all NBOUNDs in
4244    * regcomp.sym */
4245    if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4246      sayNO;
4247    break;
4248
4249   case ANYOF:  /*  /[abc]/       */
4250   case ANYOF_WARN_SUPER:
4251    if (NEXTCHR_IS_EOS)
4252     sayNO;
4253    if (utf8_target) {
4254     if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4255      sayNO;
4256     locinput += UTF8SKIP(locinput);
4257    }
4258    else {
4259     if (!REGINCLASS(rex, scan, (U8*)locinput))
4260      sayNO;
4261     locinput++;
4262    }
4263    break;
4264
4265   /* The argument (FLAGS) to all the POSIX node types is the class number
4266   * */
4267
4268   case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4269    to_complement = 1;
4270    /* FALLTHROUGH */
4271
4272   case POSIXL:    /* \w or [:punct:] etc. under /l */
4273    if (NEXTCHR_IS_EOS)
4274     sayNO;
4275
4276    /* The locale hasn't influenced the outcome before this, so defer
4277    * tainting until now */
4278    RX_MATCH_TAINTED_on(reginfo->prog);
4279
4280    /* Use isFOO_lc() for characters within Latin1.  (Note that
4281    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4282    * wouldn't be invariant) */
4283    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4284     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4285      sayNO;
4286     }
4287    }
4288    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4289     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4290           (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
4291                *(locinput + 1))))))
4292     {
4293      sayNO;
4294     }
4295    }
4296    else { /* Here, must be an above Latin-1 code point */
4297     goto utf8_posix_not_eos;
4298    }
4299
4300    /* Here, must be utf8 */
4301    locinput += UTF8SKIP(locinput);
4302    break;
4303
4304   case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4305    to_complement = 1;
4306    /* FALLTHROUGH */
4307
4308   case POSIXD:    /* \w or [:punct:] etc. under /d */
4309    if (utf8_target) {
4310     goto utf8_posix;
4311    }
4312    goto posixa;
4313
4314   case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4315
4316    if (NEXTCHR_IS_EOS) {
4317     sayNO;
4318    }
4319
4320    /* All UTF-8 variants match */
4321    if (! UTF8_IS_INVARIANT(nextchr)) {
4322     goto increment_locinput;
4323    }
4324
4325    to_complement = 1;
4326    /* FALLTHROUGH */
4327
4328   case POSIXA:    /* \w or [:punct:] etc. under /a */
4329
4330   posixa:
4331    /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4332    * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4333    * character is a single byte */
4334
4335    if (NEXTCHR_IS_EOS
4336     || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4337                FLAGS(scan)))))
4338    {
4339     sayNO;
4340    }
4341
4342    /* Here we are either not in utf8, or we matched a utf8-invariant,
4343    * so the next char is the next byte */
4344    locinput++;
4345    break;
4346
4347   case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4348    to_complement = 1;
4349    /* FALLTHROUGH */
4350
4351   case POSIXU:    /* \w or [:punct:] etc. under /u */
4352   utf8_posix:
4353    if (NEXTCHR_IS_EOS) {
4354     sayNO;
4355    }
4356   utf8_posix_not_eos:
4357
4358    /* Use _generic_isCC() for characters within Latin1.  (Note that
4359    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4360    * wouldn't be invariant) */
4361    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4362     if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4363               FLAGS(scan)))))
4364     {
4365      sayNO;
4366     }
4367     locinput++;
4368    }
4369    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4370     if (! (to_complement
4371      ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4372                *(locinput + 1)),
4373            FLAGS(scan)))))
4374     {
4375      sayNO;
4376     }
4377     locinput += 2;
4378    }
4379    else {  /* Handle above Latin-1 code points */
4380     classnum = (_char_class_number) FLAGS(scan);
4381     if (classnum < _FIRST_NON_SWASH_CC) {
4382
4383      /* Here, uses a swash to find such code points.  Load if if
4384      * not done already */
4385      if (! PL_utf8_swash_ptrs[classnum]) {
4386       U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4387       PL_utf8_swash_ptrs[classnum]
4388         = _core_swash_init("utf8",
4389           swash_property_names[classnum],
4390           &PL_sv_undef, 1, 0, NULL, &flags);
4391      }
4392      if (! (to_complement
4393       ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4394            (U8 *) locinput, TRUE))))
4395      {
4396       sayNO;
4397      }
4398     }
4399     else {  /* Here, uses macros to find above Latin-1 code points */
4400      switch (classnum) {
4401       case _CC_ENUM_SPACE:    /* XXX would require separate
4402             code if we revert the change
4403             of \v matching this */
4404       case _CC_ENUM_PSXSPC:
4405        if (! (to_complement
4406           ^ cBOOL(is_XPERLSPACE_high(locinput))))
4407        {
4408         sayNO;
4409        }
4410        break;
4411       case _CC_ENUM_BLANK:
4412        if (! (to_complement
4413            ^ cBOOL(is_HORIZWS_high(locinput))))
4414        {
4415         sayNO;
4416        }
4417        break;
4418       case _CC_ENUM_XDIGIT:
4419        if (! (to_complement
4420            ^ cBOOL(is_XDIGIT_high(locinput))))
4421        {
4422         sayNO;
4423        }
4424        break;
4425       case _CC_ENUM_VERTSPACE:
4426        if (! (to_complement
4427            ^ cBOOL(is_VERTWS_high(locinput))))
4428        {
4429         sayNO;
4430        }
4431        break;
4432       default:    /* The rest, e.g. [:cntrl:], can't match
4433          above Latin1 */
4434        if (! to_complement) {
4435         sayNO;
4436        }
4437        break;
4438      }
4439     }
4440     locinput += UTF8SKIP(locinput);
4441    }
4442    break;
4443
4444   case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4445      a Unicode extended Grapheme Cluster */
4446    /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4447    extended Grapheme Cluster is:
4448
4449    CR LF
4450    | Prepend* Begin Extend*
4451    | .
4452
4453    Begin is:           ( Special_Begin | ! Control )
4454    Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4455    Extend is:          ( Grapheme_Extend | Spacing_Mark )
4456    Control is:         [ GCB_Control | CR | LF ]
4457    Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4458
4459    If we create a 'Regular_Begin' = Begin - Special_Begin, then
4460    we can rewrite
4461
4462     Begin is ( Regular_Begin + Special Begin )
4463
4464    It turns out that 98.4% of all Unicode code points match
4465    Regular_Begin.  Doing it this way eliminates a table match in
4466    the previous implementation for almost all Unicode code points.
4467
4468    There is a subtlety with Prepend* which showed up in testing.
4469    Note that the Begin, and only the Begin is required in:
4470     | Prepend* Begin Extend*
4471    Also, Begin contains '! Control'.  A Prepend must be a
4472    '!  Control', which means it must also be a Begin.  What it
4473    comes down to is that if we match Prepend* and then find no
4474    suitable Begin afterwards, that if we backtrack the last
4475    Prepend, that one will be a suitable Begin.
4476    */
4477
4478    if (NEXTCHR_IS_EOS)
4479     sayNO;
4480    if  (! utf8_target) {
4481
4482     /* Match either CR LF  or '.', as all the other possibilities
4483     * require utf8 */
4484     locinput++;     /* Match the . or CR */
4485     if (nextchr == '\r' /* And if it was CR, and the next is LF,
4486          match the LF */
4487      && locinput < reginfo->strend
4488      && UCHARAT(locinput) == '\n')
4489     {
4490      locinput++;
4491     }
4492    }
4493    else {
4494
4495     /* Utf8: See if is ( CR LF ); already know that locinput <
4496     * reginfo->strend, so locinput+1 is in bounds */
4497     if ( nextchr == '\r' && locinput+1 < reginfo->strend
4498      && UCHARAT(locinput + 1) == '\n')
4499     {
4500      locinput += 2;
4501     }
4502     else {
4503      STRLEN len;
4504
4505      /* In case have to backtrack to beginning, then match '.' */
4506      char *starting = locinput;
4507
4508      /* In case have to backtrack the last prepend */
4509      char *previous_prepend = NULL;
4510
4511      LOAD_UTF8_CHARCLASS_GCB();
4512
4513      /* Match (prepend)*   */
4514      while (locinput < reginfo->strend
4515       && (len = is_GCB_Prepend_utf8(locinput)))
4516      {
4517       previous_prepend = locinput;
4518       locinput += len;
4519      }
4520
4521      /* As noted above, if we matched a prepend character, but
4522      * the next thing won't match, back off the last prepend we
4523      * matched, as it is guaranteed to match the begin */
4524      if (previous_prepend
4525       && (locinput >=  reginfo->strend
4526        || (! swash_fetch(PL_utf8_X_regular_begin,
4527            (U8*)locinput, utf8_target)
4528         && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4529       )
4530      {
4531       locinput = previous_prepend;
4532      }
4533
4534      /* Note that here we know reginfo->strend > locinput, as we
4535      * tested that upon input to this switch case, and if we
4536      * moved locinput forward, we tested the result just above
4537      * and it either passed, or we backed off so that it will
4538      * now pass */
4539      if (swash_fetch(PL_utf8_X_regular_begin,
4540          (U8*)locinput, utf8_target)) {
4541       locinput += UTF8SKIP(locinput);
4542      }
4543      else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4544
4545       /* Here did not match the required 'Begin' in the
4546       * second term.  So just match the very first
4547       * character, the '.' of the final term of the regex */
4548       locinput = starting + UTF8SKIP(starting);
4549       goto exit_utf8;
4550      } else {
4551
4552       /* Here is a special begin.  It can be composed of
4553       * several individual characters.  One possibility is
4554       * RI+ */
4555       if ((len = is_GCB_RI_utf8(locinput))) {
4556        locinput += len;
4557        while (locinput < reginfo->strend
4558         && (len = is_GCB_RI_utf8(locinput)))
4559        {
4560         locinput += len;
4561        }
4562       } else if ((len = is_GCB_T_utf8(locinput))) {
4563        /* Another possibility is T+ */
4564        locinput += len;
4565        while (locinput < reginfo->strend
4566         && (len = is_GCB_T_utf8(locinput)))
4567        {
4568         locinput += len;
4569        }
4570       } else {
4571
4572        /* Here, neither RI+ nor T+; must be some other
4573        * Hangul.  That means it is one of the others: L,
4574        * LV, LVT or V, and matches:
4575        * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4576
4577        /* Match L*           */
4578        while (locinput < reginfo->strend
4579         && (len = is_GCB_L_utf8(locinput)))
4580        {
4581         locinput += len;
4582        }
4583
4584        /* Here, have exhausted L*.  If the next character
4585        * is not an LV, LVT nor V, it means we had to have
4586        * at least one L, so matches L+ in the original
4587        * equation, we have a complete hangul syllable.
4588        * Are done. */
4589
4590        if (locinput < reginfo->strend
4591         && is_GCB_LV_LVT_V_utf8(locinput))
4592        {
4593         /* Otherwise keep going.  Must be LV, LVT or V.
4594         * See if LVT, by first ruling out V, then LV */
4595         if (! is_GCB_V_utf8(locinput)
4596           /* All but every TCount one is LV */
4597          && (valid_utf8_to_uvchr((U8 *) locinput,
4598                   NULL)
4599                   - SBASE)
4600           % TCount != 0)
4601         {
4602          locinput += UTF8SKIP(locinput);
4603         } else {
4604
4605          /* Must be  V or LV.  Take it, then match
4606          * V*     */
4607          locinput += UTF8SKIP(locinput);
4608          while (locinput < reginfo->strend
4609           && (len = is_GCB_V_utf8(locinput)))
4610          {
4611           locinput += len;
4612          }
4613         }
4614
4615         /* And any of LV, LVT, or V can be followed
4616         * by T*            */
4617         while (locinput < reginfo->strend
4618          && (len = is_GCB_T_utf8(locinput)))
4619         {
4620          locinput += len;
4621         }
4622        }
4623       }
4624      }
4625
4626      /* Match any extender */
4627      while (locinput < reginfo->strend
4628        && swash_fetch(PL_utf8_X_extend,
4629            (U8*)locinput, utf8_target))
4630      {
4631       locinput += UTF8SKIP(locinput);
4632      }
4633     }
4634    exit_utf8:
4635     if (locinput > reginfo->strend) sayNO;
4636    }
4637    break;
4638
4639   case NREFFL:  /*  /\g{name}/il  */
4640   {   /* The capture buffer cases.  The ones beginning with N for the
4641    named buffers just convert to the equivalent numbered and
4642    pretend they were called as the corresponding numbered buffer
4643    op.  */
4644    /* don't initialize these in the declaration, it makes C++
4645    unhappy */
4646    const char *s;
4647    char type;
4648    re_fold_t folder;
4649    const U8 *fold_array;
4650    UV utf8_fold_flags;
4651
4652    RX_MATCH_TAINTED_on(reginfo->prog);
4653    folder = foldEQ_locale;
4654    fold_array = PL_fold_locale;
4655    type = REFFL;
4656    utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4657    goto do_nref;
4658
4659   case NREFFA:  /*  /\g{name}/iaa  */
4660    folder = foldEQ_latin1;
4661    fold_array = PL_fold_latin1;
4662    type = REFFA;
4663    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4664    goto do_nref;
4665
4666   case NREFFU:  /*  /\g{name}/iu  */
4667    folder = foldEQ_latin1;
4668    fold_array = PL_fold_latin1;
4669    type = REFFU;
4670    utf8_fold_flags = 0;
4671    goto do_nref;
4672
4673   case NREFF:  /*  /\g{name}/i  */
4674    folder = foldEQ;
4675    fold_array = PL_fold;
4676    type = REFF;
4677    utf8_fold_flags = 0;
4678    goto do_nref;
4679
4680   case NREF:  /*  /\g{name}/   */
4681    type = REF;
4682    folder = NULL;
4683    fold_array = NULL;
4684    utf8_fold_flags = 0;
4685   do_nref:
4686
4687    /* For the named back references, find the corresponding buffer
4688    * number */
4689    n = reg_check_named_buff_matched(rex,scan);
4690
4691    if ( ! n ) {
4692     sayNO;
4693    }
4694    goto do_nref_ref_common;
4695
4696   case REFFL:  /*  /\1/il  */
4697    RX_MATCH_TAINTED_on(reginfo->prog);
4698    folder = foldEQ_locale;
4699    fold_array = PL_fold_locale;
4700    utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4701    goto do_ref;
4702
4703   case REFFA:  /*  /\1/iaa  */
4704    folder = foldEQ_latin1;
4705    fold_array = PL_fold_latin1;
4706    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4707    goto do_ref;
4708
4709   case REFFU:  /*  /\1/iu  */
4710    folder = foldEQ_latin1;
4711    fold_array = PL_fold_latin1;
4712    utf8_fold_flags = 0;
4713    goto do_ref;
4714
4715   case REFF:  /*  /\1/i  */
4716    folder = foldEQ;
4717    fold_array = PL_fold;
4718    utf8_fold_flags = 0;
4719    goto do_ref;
4720
4721   case REF:  /*  /\1/    */
4722    folder = NULL;
4723    fold_array = NULL;
4724    utf8_fold_flags = 0;
4725
4726   do_ref:
4727    type = OP(scan);
4728    n = ARG(scan);  /* which paren pair */
4729
4730   do_nref_ref_common:
4731    ln = rex->offs[n].start;
4732    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
4733    if (rex->lastparen < n || ln == -1)
4734     sayNO;   /* Do not match unless seen CLOSEn. */
4735    if (ln == rex->offs[n].end)
4736     break;
4737
4738    s = reginfo->strbeg + ln;
4739    if (type != REF /* REF can do byte comparison */
4740     && (utf8_target || type == REFFU))
4741    { /* XXX handle REFFL better */
4742     char * limit = reginfo->strend;
4743
4744     /* This call case insensitively compares the entire buffer
4745      * at s, with the current input starting at locinput, but
4746      * not going off the end given by reginfo->strend, and
4747      * returns in <limit> upon success, how much of the
4748      * current input was matched */
4749     if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4750          locinput, &limit, 0, utf8_target, utf8_fold_flags))
4751     {
4752      sayNO;
4753     }
4754     locinput = limit;
4755     break;
4756    }
4757
4758    /* Not utf8:  Inline the first character, for speed. */
4759    if (!NEXTCHR_IS_EOS &&
4760     UCHARAT(s) != nextchr &&
4761     (type == REF ||
4762     UCHARAT(s) != fold_array[nextchr]))
4763     sayNO;
4764    ln = rex->offs[n].end - ln;
4765    if (locinput + ln > reginfo->strend)
4766     sayNO;
4767    if (ln > 1 && (type == REF
4768       ? memNE(s, locinput, ln)
4769       : ! folder(s, locinput, ln)))
4770     sayNO;
4771    locinput += ln;
4772    break;
4773   }
4774
4775   case NOTHING: /* null op; e.g. the 'nothing' following
4776      * the '*' in m{(a+|b)*}' */
4777    break;
4778   case TAIL: /* placeholder while compiling (A|B|C) */
4779    break;
4780
4781   case BACK: /* ??? doesn't appear to be used ??? */
4782    break;
4783
4784 #undef  ST
4785 #define ST st->u.eval
4786   {
4787    SV *ret;
4788    REGEXP *re_sv;
4789    regexp *re;
4790    regexp_internal *rei;
4791    regnode *startpoint;
4792
4793   case GOSTART: /*  (?R)  */
4794   case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4795    if (cur_eval && cur_eval->locinput==locinput) {
4796     if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4797      Perl_croak(aTHX_ "Infinite recursion in regex");
4798     if ( ++nochange_depth > max_nochange_depth )
4799      Perl_croak(aTHX_
4800       "Pattern subroutine nesting without pos change"
4801       " exceeded limit in regex");
4802    } else {
4803     nochange_depth = 0;
4804    }
4805    re_sv = rex_sv;
4806    re = rex;
4807    rei = rexi;
4808    if (OP(scan)==GOSUB) {
4809     startpoint = scan + ARG2L(scan);
4810     ST.close_paren = ARG(scan);
4811    } else {
4812     startpoint = rei->program+1;
4813     ST.close_paren = 0;
4814    }
4815    goto eval_recurse_doit;
4816    assert(0); /* NOTREACHED */
4817
4818   case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4819    if (cur_eval && cur_eval->locinput==locinput) {
4820     if ( ++nochange_depth > max_nochange_depth )
4821      Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4822    } else {
4823     nochange_depth = 0;
4824    }
4825    {
4826     /* execute the code in the {...} */
4827
4828     dSP;
4829     IV before;
4830     OP * const oop = PL_op;
4831     COP * const ocurcop = PL_curcop;
4832     OP *nop;
4833     CV *newcv;
4834
4835     /* save *all* paren positions */
4836     regcppush(rex, 0, maxopenparen);
4837     REGCP_SET(runops_cp);
4838
4839     if (!caller_cv)
4840      caller_cv = find_runcv(NULL);
4841
4842     n = ARG(scan);
4843
4844     if (rexi->data->what[n] == 'r') { /* code from an external qr */
4845      newcv = (ReANY(
4846             (REGEXP*)(rexi->data->data[n])
4847            ))->qr_anoncv
4848           ;
4849      nop = (OP*)rexi->data->data[n+1];
4850     }
4851     else if (rexi->data->what[n] == 'l') { /* literal code */
4852      newcv = caller_cv;
4853      nop = (OP*)rexi->data->data[n];
4854      assert(CvDEPTH(newcv));
4855     }
4856     else {
4857      /* literal with own CV */
4858      assert(rexi->data->what[n] == 'L');
4859      newcv = rex->qr_anoncv;
4860      nop = (OP*)rexi->data->data[n];
4861     }
4862
4863     /* normally if we're about to execute code from the same
4864     * CV that we used previously, we just use the existing
4865     * CX stack entry. However, its possible that in the
4866     * meantime we may have backtracked, popped from the save
4867     * stack, and undone the SAVECOMPPAD(s) associated with
4868     * PUSH_MULTICALL; in which case PL_comppad no longer
4869     * points to newcv's pad. */
4870     if (newcv != last_pushed_cv || PL_comppad != last_pad)
4871     {
4872      U8 flags = (CXp_SUB_RE |
4873         ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
4874      if (last_pushed_cv) {
4875       CHANGE_MULTICALL_FLAGS(newcv, flags);
4876      }
4877      else {
4878       PUSH_MULTICALL_FLAGS(newcv, flags);
4879      }
4880      last_pushed_cv = newcv;
4881     }
4882     else {
4883      /* these assignments are just to silence compiler
4884      * warnings */
4885      multicall_cop = NULL;
4886      newsp = NULL;
4887     }
4888     last_pad = PL_comppad;
4889
4890     /* the initial nextstate you would normally execute
4891     * at the start of an eval (which would cause error
4892     * messages to come from the eval), may be optimised
4893     * away from the execution path in the regex code blocks;
4894     * so manually set PL_curcop to it initially */
4895     {
4896      OP *o = cUNOPx(nop)->op_first;
4897      assert(o->op_type == OP_NULL);
4898      if (o->op_targ == OP_SCOPE) {
4899       o = cUNOPo->op_first;
4900      }
4901      else {
4902       assert(o->op_targ == OP_LEAVE);
4903       o = cUNOPo->op_first;
4904       assert(o->op_type == OP_ENTER);
4905       o = o->op_sibling;
4906      }
4907
4908      if (o->op_type != OP_STUB) {
4909       assert(    o->op_type == OP_NEXTSTATE
4910         || o->op_type == OP_DBSTATE
4911         || (o->op_type == OP_NULL
4912          &&  (  o->op_targ == OP_NEXTSTATE
4913           || o->op_targ == OP_DBSTATE
4914           )
4915          )
4916       );
4917       PL_curcop = (COP*)o;
4918      }
4919     }
4920     nop = nop->op_next;
4921
4922     DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4923      "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4924
4925     rex->offs[0].end = locinput - reginfo->strbeg;
4926     if (reginfo->info_aux_eval->pos_magic)
4927       reginfo->info_aux_eval->pos_magic->mg_len
4928           = locinput - reginfo->strbeg;
4929
4930     if (sv_yes_mark) {
4931      SV *sv_mrk = get_sv("REGMARK", 1);
4932      sv_setsv(sv_mrk, sv_yes_mark);
4933     }
4934
4935     /* we don't use MULTICALL here as we want to call the
4936     * first op of the block of interest, rather than the
4937     * first op of the sub */
4938     before = (IV)(SP-PL_stack_base);
4939     PL_op = nop;
4940     CALLRUNOPS(aTHX);   /* Scalar context. */
4941     SPAGAIN;
4942     if ((IV)(SP-PL_stack_base) == before)
4943      ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4944     else {
4945      ret = POPs;
4946      PUTBACK;
4947     }
4948
4949     /* before restoring everything, evaluate the returned
4950     * value, so that 'uninit' warnings don't use the wrong
4951     * PL_op or pad. Also need to process any magic vars
4952     * (e.g. $1) *before* parentheses are restored */
4953
4954     PL_op = NULL;
4955
4956     re_sv = NULL;
4957     if (logical == 0)        /*   (?{})/   */
4958      sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4959     else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4960      sw = cBOOL(SvTRUE(ret));
4961      logical = 0;
4962     }
4963     else {                   /*  /(??{})  */
4964      /*  if its overloaded, let the regex compiler handle
4965      *  it; otherwise extract regex, or stringify  */
4966      if (!SvAMAGIC(ret)) {
4967       SV *sv = ret;
4968       if (SvROK(sv))
4969        sv = SvRV(sv);
4970       if (SvTYPE(sv) == SVt_REGEXP)
4971        re_sv = (REGEXP*) sv;
4972       else if (SvSMAGICAL(sv)) {
4973        MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4974        if (mg)
4975         re_sv = (REGEXP *) mg->mg_obj;
4976       }
4977
4978       /* force any magic, undef warnings here */
4979       if (!re_sv) {
4980        ret = sv_mortalcopy(ret);
4981        (void) SvPV_force_nolen(ret);
4982       }
4983      }
4984
4985     }
4986
4987     /* *** Note that at this point we don't restore
4988     * PL_comppad, (or pop the CxSUB) on the assumption it may
4989     * be used again soon. This is safe as long as nothing
4990     * in the regexp code uses the pad ! */
4991     PL_op = oop;
4992     PL_curcop = ocurcop;
4993     S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
4994     PL_curpm = PL_reg_curpm;
4995
4996     if (logical != 2)
4997      break;
4998    }
4999
5000     /* only /(??{})/  from now on */
5001     logical = 0;
5002     {
5003      /* extract RE object from returned value; compiling if
5004      * necessary */
5005
5006      if (re_sv) {
5007       re_sv = reg_temp_copy(NULL, re_sv);
5008      }
5009      else {
5010       U32 pm_flags = 0;
5011
5012       if (SvUTF8(ret) && IN_BYTES) {
5013        /* In use 'bytes': make a copy of the octet
5014        * sequence, but without the flag on */
5015        STRLEN len;
5016        const char *const p = SvPV(ret, len);
5017        ret = newSVpvn_flags(p, len, SVs_TEMP);
5018       }
5019       if (rex->intflags & PREGf_USE_RE_EVAL)
5020        pm_flags |= PMf_USE_RE_EVAL;
5021
5022       /* if we got here, it should be an engine which
5023       * supports compiling code blocks and stuff */
5024       assert(rex->engine && rex->engine->op_comp);
5025       assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5026       re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5027          rex->engine, NULL, NULL,
5028          /* copy /msix etc to inner pattern */
5029          scan->flags,
5030          pm_flags);
5031
5032       if (!(SvFLAGS(ret)
5033        & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5034         | SVs_GMG))) {
5035        /* This isn't a first class regexp. Instead, it's
5036        caching a regexp onto an existing, Perl visible
5037        scalar.  */
5038        sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5039       }
5040       /* safe to do now that any $1 etc has been
5041       * interpolated into the new pattern string and
5042       * compiled */
5043       S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5044      }
5045      SAVEFREESV(re_sv);
5046      re = ReANY(re_sv);
5047     }
5048     RXp_MATCH_COPIED_off(re);
5049     re->subbeg = rex->subbeg;
5050     re->sublen = rex->sublen;
5051     re->suboffset = rex->suboffset;
5052     re->subcoffset = rex->subcoffset;
5053     rei = RXi_GET(re);
5054     DEBUG_EXECUTE_r(
5055      debug_start_match(re_sv, utf8_target, locinput,
5056          reginfo->strend, "Matching embedded");
5057     );
5058     startpoint = rei->program + 1;
5059      ST.close_paren = 0; /* only used for GOSUB */
5060
5061   eval_recurse_doit: /* Share code with GOSUB below this line */
5062     /* run the pattern returned from (??{...}) */
5063
5064     /* Save *all* the positions. */
5065     ST.cp = regcppush(rex, 0, maxopenparen);
5066     REGCP_SET(ST.lastcp);
5067
5068     re->lastparen = 0;
5069     re->lastcloseparen = 0;
5070
5071     maxopenparen = 0;
5072
5073     /* invalidate the S-L poscache. We're now executing a
5074     * different set of WHILEM ops (and their associated
5075     * indexes) against the same string, so the bits in the
5076     * cache are meaningless. Setting maxiter to zero forces
5077     * the cache to be invalidated and zeroed before reuse.
5078     * XXX This is too dramatic a measure. Ideally we should
5079     * save the old cache and restore when running the outer
5080     * pattern again */
5081     reginfo->poscache_maxiter = 0;
5082
5083     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5084
5085     ST.prev_rex = rex_sv;
5086     ST.prev_curlyx = cur_curlyx;
5087     rex_sv = re_sv;
5088     SET_reg_curpm(rex_sv);
5089     rex = re;
5090     rexi = rei;
5091     cur_curlyx = NULL;
5092     ST.B = next;
5093     ST.prev_eval = cur_eval;
5094     cur_eval = st;
5095     /* now continue from first node in postoned RE */
5096     PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5097     assert(0); /* NOTREACHED */
5098   }
5099
5100   case EVAL_AB: /* cleanup after a successful (??{A})B */
5101    /* note: this is called twice; first after popping B, then A */
5102    rex_sv = ST.prev_rex;
5103    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5104    SET_reg_curpm(rex_sv);
5105    rex = ReANY(rex_sv);
5106    rexi = RXi_GET(rex);
5107    regcpblow(ST.cp);
5108    cur_eval = ST.prev_eval;
5109    cur_curlyx = ST.prev_curlyx;
5110
5111    /* Invalidate cache. See "invalidate" comment above. */
5112    reginfo->poscache_maxiter = 0;
5113    if ( nochange_depth )
5114     nochange_depth--;
5115    sayYES;
5116
5117
5118   case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5119    /* note: this is called twice; first after popping B, then A */
5120    rex_sv = ST.prev_rex;
5121    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5122    SET_reg_curpm(rex_sv);
5123    rex = ReANY(rex_sv);
5124    rexi = RXi_GET(rex);
5125
5126    REGCP_UNWIND(ST.lastcp);
5127    regcppop(rex, &maxopenparen);
5128    cur_eval = ST.prev_eval;
5129    cur_curlyx = ST.prev_curlyx;
5130    /* Invalidate cache. See "invalidate" comment above. */
5131    reginfo->poscache_maxiter = 0;
5132    if ( nochange_depth )
5133     nochange_depth--;
5134    sayNO_SILENT;
5135 #undef ST
5136
5137   case OPEN: /*  (  */
5138    n = ARG(scan);  /* which paren pair */
5139    rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5140    if (n > maxopenparen)
5141     maxopenparen = n;
5142    DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5143     "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5144     PTR2UV(rex),
5145     PTR2UV(rex->offs),
5146     (UV)n,
5147     (IV)rex->offs[n].start_tmp,
5148     (UV)maxopenparen
5149    ));
5150    lastopen = n;
5151    break;
5152
5153 /* XXX really need to log other places start/end are set too */
5154 #define CLOSE_CAPTURE \
5155  rex->offs[n].start = rex->offs[n].start_tmp; \
5156  rex->offs[n].end = locinput - reginfo->strbeg; \
5157  DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5158   "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5159   PTR2UV(rex), \
5160   PTR2UV(rex->offs), \
5161   (UV)n, \
5162   (IV)rex->offs[n].start, \
5163   (IV)rex->offs[n].end \
5164  ))
5165
5166   case CLOSE:  /*  )  */
5167    n = ARG(scan);  /* which paren pair */
5168    CLOSE_CAPTURE;
5169    if (n > rex->lastparen)
5170     rex->lastparen = n;
5171    rex->lastcloseparen = n;
5172    if (cur_eval && cur_eval->u.eval.close_paren == n) {
5173     goto fake_end;
5174    }
5175    break;
5176
5177   case ACCEPT:  /*  (*ACCEPT)  */
5178    if (ARG(scan)){
5179     regnode *cursor;
5180     for (cursor=scan;
5181      cursor && OP(cursor)!=END;
5182      cursor=regnext(cursor))
5183     {
5184      if ( OP(cursor)==CLOSE ){
5185       n = ARG(cursor);
5186       if ( n <= lastopen ) {
5187        CLOSE_CAPTURE;
5188        if (n > rex->lastparen)
5189         rex->lastparen = n;
5190        rex->lastcloseparen = n;
5191        if ( n == ARG(scan) || (cur_eval &&
5192         cur_eval->u.eval.close_paren == n))
5193         break;
5194       }
5195      }
5196     }
5197    }
5198    goto fake_end;
5199    /*NOTREACHED*/
5200
5201   case GROUPP:  /*  (?(1))  */
5202    n = ARG(scan);  /* which paren pair */
5203    sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5204    break;
5205
5206   case NGROUPP:  /*  (?(<name>))  */
5207    /* reg_check_named_buff_matched returns 0 for no match */
5208    sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5209    break;
5210
5211   case INSUBP:   /*  (?(R))  */
5212    n = ARG(scan);
5213    sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5214    break;
5215
5216   case DEFINEP:  /*  (?(DEFINE))  */
5217    sw = 0;
5218    break;
5219
5220   case IFTHEN:   /*  (?(cond)A|B)  */
5221    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5222    if (sw)
5223     next = NEXTOPER(NEXTOPER(scan));
5224    else {
5225     next = scan + ARG(scan);
5226     if (OP(next) == IFTHEN) /* Fake one. */
5227      next = NEXTOPER(NEXTOPER(next));
5228    }
5229    break;
5230
5231   case LOGICAL:  /* modifier for EVAL and IFMATCH */
5232    logical = scan->flags;
5233    break;
5234
5235 /*******************************************************************
5236
5237 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5238 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5239 STAR/PLUS/CURLY/CURLYN are used instead.)
5240
5241 A*B is compiled as <CURLYX><A><WHILEM><B>
5242
5243 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5244 state, which contains the current count, initialised to -1. It also sets
5245 cur_curlyx to point to this state, with any previous value saved in the
5246 state block.
5247
5248 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5249 since the pattern may possibly match zero times (i.e. it's a while {} loop
5250 rather than a do {} while loop).
5251
5252 Each entry to WHILEM represents a successful match of A. The count in the
5253 CURLYX block is incremented, another WHILEM state is pushed, and execution
5254 passes to A or B depending on greediness and the current count.
5255
5256 For example, if matching against the string a1a2a3b (where the aN are
5257 substrings that match /A/), then the match progresses as follows: (the
5258 pushed states are interspersed with the bits of strings matched so far):
5259
5260  <CURLYX cnt=-1>
5261  <CURLYX cnt=0><WHILEM>
5262  <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5263  <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5264  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5265  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5266
5267 (Contrast this with something like CURLYM, which maintains only a single
5268 backtrack state:
5269
5270  <CURLYM cnt=0> a1
5271  a1 <CURLYM cnt=1> a2
5272  a1 a2 <CURLYM cnt=2> a3
5273  a1 a2 a3 <CURLYM cnt=3> b
5274 )
5275
5276 Each WHILEM state block marks a point to backtrack to upon partial failure
5277 of A or B, and also contains some minor state data related to that
5278 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5279 overall state, such as the count, and pointers to the A and B ops.
5280
5281 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5282 must always point to the *current* CURLYX block, the rules are:
5283
5284 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5285 and set cur_curlyx to point the new block.
5286
5287 When popping the CURLYX block after a successful or unsuccessful match,
5288 restore the previous cur_curlyx.
5289
5290 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5291 to the outer one saved in the CURLYX block.
5292
5293 When popping the WHILEM block after a successful or unsuccessful B match,
5294 restore the previous cur_curlyx.
5295
5296 Here's an example for the pattern (AI* BI)*BO
5297 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5298
5299 cur_
5300 curlyx backtrack stack
5301 ------ ---------------
5302 NULL
5303 CO     <CO prev=NULL> <WO>
5304 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5305 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5306 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5307
5308 At this point the pattern succeeds, and we work back down the stack to
5309 clean up, restoring as we go:
5310
5311 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5312 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5313 CO     <CO prev=NULL> <WO>
5314 NULL
5315
5316 *******************************************************************/
5317
5318 #define ST st->u.curlyx
5319
5320   case CURLYX:    /* start of /A*B/  (for complex A) */
5321   {
5322    /* No need to save/restore up to this paren */
5323    I32 parenfloor = scan->flags;
5324
5325    assert(next); /* keep Coverity happy */
5326    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5327     next += ARG(next);
5328
5329    /* XXXX Probably it is better to teach regpush to support
5330    parenfloor > maxopenparen ... */
5331    if (parenfloor > (I32)rex->lastparen)
5332     parenfloor = rex->lastparen; /* Pessimization... */
5333
5334    ST.prev_curlyx= cur_curlyx;
5335    cur_curlyx = st;
5336    ST.cp = PL_savestack_ix;
5337
5338    /* these fields contain the state of the current curly.
5339    * they are accessed by subsequent WHILEMs */
5340    ST.parenfloor = parenfloor;
5341    ST.me = scan;
5342    ST.B = next;
5343    ST.minmod = minmod;
5344    minmod = 0;
5345    ST.count = -1; /* this will be updated by WHILEM */
5346    ST.lastloc = NULL;  /* this will be updated by WHILEM */
5347
5348    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5349    assert(0); /* NOTREACHED */
5350   }
5351
5352   case CURLYX_end: /* just finished matching all of A*B */
5353    cur_curlyx = ST.prev_curlyx;
5354    sayYES;
5355    assert(0); /* NOTREACHED */
5356
5357   case CURLYX_end_fail: /* just failed to match all of A*B */
5358    regcpblow(ST.cp);
5359    cur_curlyx = ST.prev_curlyx;
5360    sayNO;
5361    assert(0); /* NOTREACHED */
5362
5363
5364 #undef ST
5365 #define ST st->u.whilem
5366
5367   case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5368   {
5369    /* see the discussion above about CURLYX/WHILEM */
5370    I32 n;
5371    int min = ARG1(cur_curlyx->u.curlyx.me);
5372    int max = ARG2(cur_curlyx->u.curlyx.me);
5373    regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5374
5375    assert(cur_curlyx); /* keep Coverity happy */
5376    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5377    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5378    ST.cache_offset = 0;
5379    ST.cache_mask = 0;
5380
5381
5382    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5383     "%*s  whilem: matched %ld out of %d..%d\n",
5384     REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5385    );
5386
5387    /* First just match a string of min A's. */
5388
5389    if (n < min) {
5390     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5391          maxopenparen);
5392     cur_curlyx->u.curlyx.lastloc = locinput;
5393     REGCP_SET(ST.lastcp);
5394
5395     PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5396     assert(0); /* NOTREACHED */
5397    }
5398
5399    /* If degenerate A matches "", assume A done. */
5400
5401    if (locinput == cur_curlyx->u.curlyx.lastloc) {
5402     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5403     "%*s  whilem: empty match detected, trying continuation...\n",
5404     REPORT_CODE_OFF+depth*2, "")
5405     );
5406     goto do_whilem_B_max;
5407    }
5408
5409    /* super-linear cache processing.
5410    *
5411    * The idea here is that for certain types of CURLYX/WHILEM -
5412    * principally those whose upper bound is infinity (and
5413    * excluding regexes that have things like \1 and other very
5414    * non-regular expresssiony things), then if a pattern like
5415    * /....A*.../ fails and we backtrack to the WHILEM, then we
5416    * make a note that this particular WHILEM op was at string
5417    * position 47 (say) when the rest of pattern failed. Then, if
5418    * we ever find ourselves back at that WHILEM, and at string
5419    * position 47 again, we can just fail immediately rather than
5420    * running the rest of the pattern again.
5421    *
5422    * This is very handy when patterns start to go
5423    * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5424    * with a combinatorial explosion of backtracking.
5425    *
5426    * The cache is implemented as a bit array, with one bit per
5427    * string byte position per WHILEM op (up to 16) - so its
5428    * between 0.25 and 2x the string size.
5429    *
5430    * To avoid allocating a poscache buffer every time, we do an
5431    * initially countdown; only after we have  executed a WHILEM
5432    * op (string-length x #WHILEMs) times do we allocate the
5433    * cache.
5434    *
5435    * The top 4 bits of scan->flags byte say how many different
5436    * relevant CURLLYX/WHILEM op pairs there are, while the
5437    * bottom 4-bits is the identifying index number of this
5438    * WHILEM.
5439    */
5440
5441    if (scan->flags) {
5442
5443     if (!reginfo->poscache_maxiter) {
5444      /* start the countdown: Postpone detection until we
5445      * know the match is not *that* much linear. */
5446      reginfo->poscache_maxiter
5447       =    (reginfo->strend - reginfo->strbeg + 1)
5448       * (scan->flags>>4);
5449      /* possible overflow for long strings and many CURLYX's */
5450      if (reginfo->poscache_maxiter < 0)
5451       reginfo->poscache_maxiter = I32_MAX;
5452      reginfo->poscache_iter = reginfo->poscache_maxiter;
5453     }
5454
5455     if (reginfo->poscache_iter-- == 0) {
5456      /* initialise cache */
5457      const I32 size = (reginfo->poscache_maxiter + 7)/8;
5458      regmatch_info_aux *const aux = reginfo->info_aux;
5459      if (aux->poscache) {
5460       if ((I32)reginfo->poscache_size < size) {
5461        Renew(aux->poscache, size, char);
5462        reginfo->poscache_size = size;
5463       }
5464       Zero(aux->poscache, size, char);
5465      }
5466      else {
5467       reginfo->poscache_size = size;
5468       Newxz(aux->poscache, size, char);
5469      }
5470      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5471  "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5472        PL_colors[4], PL_colors[5])
5473      );
5474     }
5475
5476     if (reginfo->poscache_iter < 0) {
5477      /* have we already failed at this position? */
5478      I32 offset, mask;
5479
5480      reginfo->poscache_iter = -1; /* stop eventual underflow */
5481      offset  = (scan->flags & 0xf) - 1
5482         +   (locinput - reginfo->strbeg)
5483         * (scan->flags>>4);
5484      mask    = 1 << (offset % 8);
5485      offset /= 8;
5486      if (reginfo->info_aux->poscache[offset] & mask) {
5487       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5488        "%*s  whilem: (cache) already tried at this position...\n",
5489        REPORT_CODE_OFF+depth*2, "")
5490       );
5491       sayNO; /* cache records failure */
5492      }
5493      ST.cache_offset = offset;
5494      ST.cache_mask   = mask;
5495     }
5496    }
5497
5498    /* Prefer B over A for minimal matching. */
5499
5500    if (cur_curlyx->u.curlyx.minmod) {
5501     ST.save_curlyx = cur_curlyx;
5502     cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5503     ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5504        maxopenparen);
5505     REGCP_SET(ST.lastcp);
5506     PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5507          locinput);
5508     assert(0); /* NOTREACHED */
5509    }
5510
5511    /* Prefer A over B for maximal matching. */
5512
5513    if (n < max) { /* More greed allowed? */
5514     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5515        maxopenparen);
5516     cur_curlyx->u.curlyx.lastloc = locinput;
5517     REGCP_SET(ST.lastcp);
5518     PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5519     assert(0); /* NOTREACHED */
5520    }
5521    goto do_whilem_B_max;
5522   }
5523   assert(0); /* NOTREACHED */
5524
5525   case WHILEM_B_min: /* just matched B in a minimal match */
5526   case WHILEM_B_max: /* just matched B in a maximal match */
5527    cur_curlyx = ST.save_curlyx;
5528    sayYES;
5529    assert(0); /* NOTREACHED */
5530
5531   case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5532    cur_curlyx = ST.save_curlyx;
5533    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5534    cur_curlyx->u.curlyx.count--;
5535    CACHEsayNO;
5536    assert(0); /* NOTREACHED */
5537
5538   case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5539    /* FALL THROUGH */
5540   case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5541    REGCP_UNWIND(ST.lastcp);
5542    regcppop(rex, &maxopenparen);
5543    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5544    cur_curlyx->u.curlyx.count--;
5545    CACHEsayNO;
5546    assert(0); /* NOTREACHED */
5547
5548   case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5549    REGCP_UNWIND(ST.lastcp);
5550    regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5551    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5552     "%*s  whilem: failed, trying continuation...\n",
5553     REPORT_CODE_OFF+depth*2, "")
5554    );
5555   do_whilem_B_max:
5556    if (cur_curlyx->u.curlyx.count >= REG_INFTY
5557     && ckWARN(WARN_REGEXP)
5558     && !reginfo->warned)
5559    {
5560     reginfo->warned = TRUE;
5561     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5562      "Complex regular subexpression recursion limit (%d) "
5563      "exceeded",
5564      REG_INFTY - 1);
5565    }
5566
5567    /* now try B */
5568    ST.save_curlyx = cur_curlyx;
5569    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5570    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5571         locinput);
5572    assert(0); /* NOTREACHED */
5573
5574   case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5575    cur_curlyx = ST.save_curlyx;
5576    REGCP_UNWIND(ST.lastcp);
5577    regcppop(rex, &maxopenparen);
5578
5579    if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5580     /* Maximum greed exceeded */
5581     if (cur_curlyx->u.curlyx.count >= REG_INFTY
5582      && ckWARN(WARN_REGEXP)
5583      && !reginfo->warned)
5584     {
5585      reginfo->warned = TRUE;
5586      Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5587       "Complex regular subexpression recursion "
5588       "limit (%d) exceeded",
5589       REG_INFTY - 1);
5590     }
5591     cur_curlyx->u.curlyx.count--;
5592     CACHEsayNO;
5593    }
5594
5595    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5596     "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5597    );
5598    /* Try grabbing another A and see if it helps. */
5599    cur_curlyx->u.curlyx.lastloc = locinput;
5600    ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5601        maxopenparen);
5602    REGCP_SET(ST.lastcp);
5603    PUSH_STATE_GOTO(WHILEM_A_min,
5604     /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5605     locinput);
5606    assert(0); /* NOTREACHED */
5607
5608 #undef  ST
5609 #define ST st->u.branch
5610
5611   case BRANCHJ:     /*  /(...|A|...)/ with long next pointer */
5612    next = scan + ARG(scan);
5613    if (next == scan)
5614     next = NULL;
5615    scan = NEXTOPER(scan);
5616    /* FALL THROUGH */
5617
5618   case BRANCH:     /*  /(...|A|...)/ */
5619    scan = NEXTOPER(scan); /* scan now points to inner node */
5620    ST.lastparen = rex->lastparen;
5621    ST.lastcloseparen = rex->lastcloseparen;
5622    ST.next_branch = next;
5623    REGCP_SET(ST.cp);
5624
5625    /* Now go into the branch */
5626    if (has_cutgroup) {
5627     PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5628    } else {
5629     PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5630    }
5631    assert(0); /* NOTREACHED */
5632
5633   case CUTGROUP:  /*  /(*THEN)/  */
5634    sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5635     MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5636    PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5637    assert(0); /* NOTREACHED */
5638
5639   case CUTGROUP_next_fail:
5640    do_cutgroup = 1;
5641    no_final = 1;
5642    if (st->u.mark.mark_name)
5643     sv_commit = st->u.mark.mark_name;
5644    sayNO;
5645    assert(0); /* NOTREACHED */
5646
5647   case BRANCH_next:
5648    sayYES;
5649    assert(0); /* NOTREACHED */
5650
5651   case BRANCH_next_fail: /* that branch failed; try the next, if any */
5652    if (do_cutgroup) {
5653     do_cutgroup = 0;
5654     no_final = 0;
5655    }
5656    REGCP_UNWIND(ST.cp);
5657    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5658    scan = ST.next_branch;
5659    /* no more branches? */
5660    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5661     DEBUG_EXECUTE_r({
5662      PerlIO_printf( Perl_debug_log,
5663       "%*s  %sBRANCH failed...%s\n",
5664       REPORT_CODE_OFF+depth*2, "",
5665       PL_colors[4],
5666       PL_colors[5] );
5667     });
5668     sayNO_SILENT;
5669    }
5670    continue; /* execute next BRANCH[J] op */
5671    assert(0); /* NOTREACHED */
5672
5673   case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5674    minmod = 1;
5675    break;
5676
5677 #undef  ST
5678 #define ST st->u.curlym
5679
5680   case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5681
5682    /* This is an optimisation of CURLYX that enables us to push
5683    * only a single backtracking state, no matter how many matches
5684    * there are in {m,n}. It relies on the pattern being constant
5685    * length, with no parens to influence future backrefs
5686    */
5687
5688    ST.me = scan;
5689    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5690
5691    ST.lastparen      = rex->lastparen;
5692    ST.lastcloseparen = rex->lastcloseparen;
5693
5694    /* if paren positive, emulate an OPEN/CLOSE around A */
5695    if (ST.me->flags) {
5696     U32 paren = ST.me->flags;
5697     if (paren > maxopenparen)
5698      maxopenparen = paren;
5699     scan += NEXT_OFF(scan); /* Skip former OPEN. */
5700    }
5701    ST.A = scan;
5702    ST.B = next;
5703    ST.alen = 0;
5704    ST.count = 0;
5705    ST.minmod = minmod;
5706    minmod = 0;
5707    ST.c1 = CHRTEST_UNINIT;
5708    REGCP_SET(ST.cp);
5709
5710    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5711     goto curlym_do_B;
5712
5713   curlym_do_A: /* execute the A in /A{m,n}B/  */
5714    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5715    assert(0); /* NOTREACHED */
5716
5717   case CURLYM_A: /* we've just matched an A */
5718    ST.count++;
5719    /* after first match, determine A's length: u.curlym.alen */
5720    if (ST.count == 1) {
5721     if (reginfo->is_utf8_target) {
5722      char *s = st->locinput;
5723      while (s < locinput) {
5724       ST.alen++;
5725       s += UTF8SKIP(s);
5726      }
5727     }
5728     else {
5729      ST.alen = locinput - st->locinput;
5730     }
5731     if (ST.alen == 0)
5732      ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5733    }
5734    DEBUG_EXECUTE_r(
5735     PerlIO_printf(Perl_debug_log,
5736       "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5737       (int)(REPORT_CODE_OFF+(depth*2)), "",
5738       (IV) ST.count, (IV)ST.alen)
5739    );
5740
5741    if (cur_eval && cur_eval->u.eval.close_paren &&
5742     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5743     goto fake_end;
5744
5745    {
5746     I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5747     if ( max == REG_INFTY || ST.count < max )
5748      goto curlym_do_A; /* try to match another A */
5749    }
5750    goto curlym_do_B; /* try to match B */
5751
5752   case CURLYM_A_fail: /* just failed to match an A */
5753    REGCP_UNWIND(ST.cp);
5754
5755    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5756     || (cur_eval && cur_eval->u.eval.close_paren &&
5757      cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5758     sayNO;
5759
5760   curlym_do_B: /* execute the B in /A{m,n}B/  */
5761    if (ST.c1 == CHRTEST_UNINIT) {
5762     /* calculate c1 and c2 for possible match of 1st char
5763     * following curly */
5764     ST.c1 = ST.c2 = CHRTEST_VOID;
5765     if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5766      regnode *text_node = ST.B;
5767      if (! HAS_TEXT(text_node))
5768       FIND_NEXT_IMPT(text_node);
5769      /* this used to be
5770
5771       (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5772
5773        But the former is redundant in light of the latter.
5774
5775        if this changes back then the macro for
5776        IS_TEXT and friends need to change.
5777      */
5778      if (PL_regkind[OP(text_node)] == EXACT) {
5779       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5780       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5781       reginfo))
5782       {
5783        sayNO;
5784       }
5785      }
5786     }
5787    }
5788
5789    DEBUG_EXECUTE_r(
5790     PerlIO_printf(Perl_debug_log,
5791      "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5792      (int)(REPORT_CODE_OFF+(depth*2)),
5793      "", (IV)ST.count)
5794     );
5795    if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5796     if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5797      if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5798       && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5799      {
5800       /* simulate B failing */
5801       DEBUG_OPTIMISE_r(
5802        PerlIO_printf(Perl_debug_log,
5803         "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5804         (int)(REPORT_CODE_OFF+(depth*2)),"",
5805         valid_utf8_to_uvchr((U8 *) locinput, NULL),
5806         valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5807         valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5808       );
5809       state_num = CURLYM_B_fail;
5810       goto reenter_switch;
5811      }
5812     }
5813     else if (nextchr != ST.c1 && nextchr != ST.c2) {
5814      /* simulate B failing */
5815      DEBUG_OPTIMISE_r(
5816       PerlIO_printf(Perl_debug_log,
5817        "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5818        (int)(REPORT_CODE_OFF+(depth*2)),"",
5819        (int) nextchr, ST.c1, ST.c2)
5820      );
5821      state_num = CURLYM_B_fail;
5822      goto reenter_switch;
5823     }
5824    }
5825
5826    if (ST.me->flags) {
5827     /* emulate CLOSE: mark current A as captured */
5828     I32 paren = ST.me->flags;
5829     if (ST.count) {
5830      rex->offs[paren].start
5831       = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5832      rex->offs[paren].end = locinput - reginfo->strbeg;
5833      if ((U32)paren > rex->lastparen)
5834       rex->lastparen = paren;
5835      rex->lastcloseparen = paren;
5836     }
5837     else
5838      rex->offs[paren].end = -1;
5839     if (cur_eval && cur_eval->u.eval.close_paren &&
5840      cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5841     {
5842      if (ST.count)
5843       goto fake_end;
5844      else
5845       sayNO;
5846     }
5847    }
5848
5849    PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5850    assert(0); /* NOTREACHED */
5851
5852   case CURLYM_B_fail: /* just failed to match a B */
5853    REGCP_UNWIND(ST.cp);
5854    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5855    if (ST.minmod) {
5856     I32 max = ARG2(ST.me);
5857     if (max != REG_INFTY && ST.count == max)
5858      sayNO;
5859     goto curlym_do_A; /* try to match a further A */
5860    }
5861    /* backtrack one A */
5862    if (ST.count == ARG1(ST.me) /* min */)
5863     sayNO;
5864    ST.count--;
5865    SET_locinput(HOPc(locinput, -ST.alen));
5866    goto curlym_do_B; /* try to match B */
5867
5868 #undef ST
5869 #define ST st->u.curly
5870
5871 #define CURLY_SETPAREN(paren, success) \
5872  if (paren) { \
5873   if (success) { \
5874    rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
5875    rex->offs[paren].end = locinput - reginfo->strbeg; \
5876    if (paren > rex->lastparen) \
5877     rex->lastparen = paren; \
5878    rex->lastcloseparen = paren; \
5879   } \
5880   else { \
5881    rex->offs[paren].end = -1; \
5882    rex->lastparen      = ST.lastparen; \
5883    rex->lastcloseparen = ST.lastcloseparen; \
5884   } \
5885  }
5886
5887   case STAR:  /*  /A*B/ where A is width 1 char */
5888    ST.paren = 0;
5889    ST.min = 0;
5890    ST.max = REG_INFTY;
5891    scan = NEXTOPER(scan);
5892    goto repeat;
5893
5894   case PLUS:  /*  /A+B/ where A is width 1 char */
5895    ST.paren = 0;
5896    ST.min = 1;
5897    ST.max = REG_INFTY;
5898    scan = NEXTOPER(scan);
5899    goto repeat;
5900
5901   case CURLYN:  /*  /(A){m,n}B/ where A is width 1 char */
5902    ST.paren = scan->flags; /* Which paren to set */
5903    ST.lastparen      = rex->lastparen;
5904    ST.lastcloseparen = rex->lastcloseparen;
5905    if (ST.paren > maxopenparen)
5906     maxopenparen = ST.paren;
5907    ST.min = ARG1(scan);  /* min to match */
5908    ST.max = ARG2(scan);  /* max to match */
5909    if (cur_eval && cur_eval->u.eval.close_paren &&
5910     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5911     ST.min=1;
5912     ST.max=1;
5913    }
5914    scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5915    goto repeat;
5916
5917   case CURLY:  /*  /A{m,n}B/ where A is width 1 char */
5918    ST.paren = 0;
5919    ST.min = ARG1(scan);  /* min to match */
5920    ST.max = ARG2(scan);  /* max to match */
5921    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5922   repeat:
5923    /*
5924    * Lookahead to avoid useless match attempts
5925    * when we know what character comes next.
5926    *
5927    * Used to only do .*x and .*?x, but now it allows
5928    * for )'s, ('s and (?{ ... })'s to be in the way
5929    * of the quantifier and the EXACT-like node.  -- japhy
5930    */
5931
5932    assert(ST.min <= ST.max);
5933    if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5934     ST.c1 = ST.c2 = CHRTEST_VOID;
5935    }
5936    else {
5937     regnode *text_node = next;
5938
5939     if (! HAS_TEXT(text_node))
5940      FIND_NEXT_IMPT(text_node);
5941
5942     if (! HAS_TEXT(text_node))
5943      ST.c1 = ST.c2 = CHRTEST_VOID;
5944     else {
5945      if ( PL_regkind[OP(text_node)] != EXACT ) {
5946       ST.c1 = ST.c2 = CHRTEST_VOID;
5947      }
5948      else {
5949
5950      /*  Currently we only get here when
5951
5952       PL_rekind[OP(text_node)] == EXACT
5953
5954       if this changes back then the macro for IS_TEXT and
5955       friends need to change. */
5956       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5957       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5958       reginfo))
5959       {
5960        sayNO;
5961       }
5962      }
5963     }
5964    }
5965
5966    ST.A = scan;
5967    ST.B = next;
5968    if (minmod) {
5969     char *li = locinput;
5970     minmod = 0;
5971     if (ST.min &&
5972       regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
5973        < ST.min)
5974      sayNO;
5975     SET_locinput(li);
5976     ST.count = ST.min;
5977     REGCP_SET(ST.cp);
5978     if (ST.c1 == CHRTEST_VOID)
5979      goto curly_try_B_min;
5980
5981     ST.oldloc = locinput;
5982
5983     /* set ST.maxpos to the furthest point along the
5984     * string that could possibly match */
5985     if  (ST.max == REG_INFTY) {
5986      ST.maxpos = reginfo->strend - 1;
5987      if (utf8_target)
5988       while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5989        ST.maxpos--;
5990     }
5991     else if (utf8_target) {
5992      int m = ST.max - ST.min;
5993      for (ST.maxpos = locinput;
5994       m >0 && ST.maxpos < reginfo->strend; m--)
5995       ST.maxpos += UTF8SKIP(ST.maxpos);
5996     }
5997     else {
5998      ST.maxpos = locinput + ST.max - ST.min;
5999      if (ST.maxpos >= reginfo->strend)
6000       ST.maxpos = reginfo->strend - 1;
6001     }
6002     goto curly_try_B_min_known;
6003
6004    }
6005    else {
6006     /* avoid taking address of locinput, so it can remain
6007     * a register var */
6008     char *li = locinput;
6009     ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6010     if (ST.count < ST.min)
6011      sayNO;
6012     SET_locinput(li);
6013     if ((ST.count > ST.min)
6014      && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6015     {
6016      /* A{m,n} must come at the end of the string, there's
6017      * no point in backing off ... */
6018      ST.min = ST.count;
6019      /* ...except that $ and \Z can match before *and* after
6020      newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6021      We may back off by one in this case. */
6022      if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6023       ST.min--;
6024     }
6025     REGCP_SET(ST.cp);
6026     goto curly_try_B_max;
6027    }
6028    assert(0); /* NOTREACHED */
6029
6030
6031   case CURLY_B_min_known_fail:
6032    /* failed to find B in a non-greedy match where c1,c2 valid */
6033
6034    REGCP_UNWIND(ST.cp);
6035    if (ST.paren) {
6036     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6037    }
6038    /* Couldn't or didn't -- move forward. */
6039    ST.oldloc = locinput;
6040    if (utf8_target)
6041     locinput += UTF8SKIP(locinput);
6042    else
6043     locinput++;
6044    ST.count++;
6045   curly_try_B_min_known:
6046    /* find the next place where 'B' could work, then call B */
6047    {
6048     int n;
6049     if (utf8_target) {
6050      n = (ST.oldloc == locinput) ? 0 : 1;
6051      if (ST.c1 == ST.c2) {
6052       /* set n to utf8_distance(oldloc, locinput) */
6053       while (locinput <= ST.maxpos
6054        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6055       {
6056        locinput += UTF8SKIP(locinput);
6057        n++;
6058       }
6059      }
6060      else {
6061       /* set n to utf8_distance(oldloc, locinput) */
6062       while (locinput <= ST.maxpos
6063        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6064        && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6065       {
6066        locinput += UTF8SKIP(locinput);
6067        n++;
6068       }
6069      }
6070     }
6071     else {  /* Not utf8_target */
6072      if (ST.c1 == ST.c2) {
6073       while (locinput <= ST.maxpos &&
6074        UCHARAT(locinput) != ST.c1)
6075        locinput++;
6076      }
6077      else {
6078       while (locinput <= ST.maxpos
6079        && UCHARAT(locinput) != ST.c1
6080        && UCHARAT(locinput) != ST.c2)
6081        locinput++;
6082      }
6083      n = locinput - ST.oldloc;
6084     }
6085     if (locinput > ST.maxpos)
6086      sayNO;
6087     if (n) {
6088      /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6089      * at b; check that everything between oldloc and
6090      * locinput matches */
6091      char *li = ST.oldloc;
6092      ST.count += n;
6093      if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6094       sayNO;
6095      assert(n == REG_INFTY || locinput == li);
6096     }
6097     CURLY_SETPAREN(ST.paren, ST.count);
6098     if (cur_eval && cur_eval->u.eval.close_paren &&
6099      cur_eval->u.eval.close_paren == (U32)ST.paren) {
6100      goto fake_end;
6101     }
6102     PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6103    }
6104    assert(0); /* NOTREACHED */
6105
6106
6107   case CURLY_B_min_fail:
6108    /* failed to find B in a non-greedy match where c1,c2 invalid */
6109
6110    REGCP_UNWIND(ST.cp);
6111    if (ST.paren) {
6112     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6113    }
6114    /* failed -- move forward one */
6115    {
6116     char *li = locinput;
6117     if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6118      sayNO;
6119     }
6120     locinput = li;
6121    }
6122    {
6123     ST.count++;
6124     if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6125       ST.count > 0)) /* count overflow ? */
6126     {
6127     curly_try_B_min:
6128      CURLY_SETPAREN(ST.paren, ST.count);
6129      if (cur_eval && cur_eval->u.eval.close_paren &&
6130       cur_eval->u.eval.close_paren == (U32)ST.paren) {
6131       goto fake_end;
6132      }
6133      PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6134     }
6135    }
6136    sayNO;
6137    assert(0); /* NOTREACHED */
6138
6139
6140   curly_try_B_max:
6141    /* a successful greedy match: now try to match B */
6142    if (cur_eval && cur_eval->u.eval.close_paren &&
6143     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6144     goto fake_end;
6145    }
6146    {
6147     bool could_match = locinput < reginfo->strend;
6148
6149     /* If it could work, try it. */
6150     if (ST.c1 != CHRTEST_VOID && could_match) {
6151      if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6152      {
6153       could_match = memEQ(locinput,
6154            ST.c1_utf8,
6155            UTF8SKIP(locinput))
6156          || memEQ(locinput,
6157            ST.c2_utf8,
6158            UTF8SKIP(locinput));
6159      }
6160      else {
6161       could_match = UCHARAT(locinput) == ST.c1
6162          || UCHARAT(locinput) == ST.c2;
6163      }
6164     }
6165     if (ST.c1 == CHRTEST_VOID || could_match) {
6166      CURLY_SETPAREN(ST.paren, ST.count);
6167      PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6168      assert(0); /* NOTREACHED */
6169     }
6170    }
6171    /* FALL THROUGH */
6172
6173   case CURLY_B_max_fail:
6174    /* failed to find B in a greedy match */
6175
6176    REGCP_UNWIND(ST.cp);
6177    if (ST.paren) {
6178     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6179    }
6180    /*  back up. */
6181    if (--ST.count < ST.min)
6182     sayNO;
6183    locinput = HOPc(locinput, -1);
6184    goto curly_try_B_max;
6185
6186 #undef ST
6187
6188   case END: /*  last op of main pattern  */
6189    fake_end:
6190    if (cur_eval) {
6191     /* we've just finished A in /(??{A})B/; now continue with B */
6192
6193     st->u.eval.prev_rex = rex_sv;  /* inner */
6194
6195     /* Save *all* the positions. */
6196     st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6197     rex_sv = cur_eval->u.eval.prev_rex;
6198     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6199     SET_reg_curpm(rex_sv);
6200     rex = ReANY(rex_sv);
6201     rexi = RXi_GET(rex);
6202     cur_curlyx = cur_eval->u.eval.prev_curlyx;
6203
6204     REGCP_SET(st->u.eval.lastcp);
6205
6206     /* Restore parens of the outer rex without popping the
6207     * savestack */
6208     S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6209           &maxopenparen);
6210
6211     st->u.eval.prev_eval = cur_eval;
6212     cur_eval = cur_eval->u.eval.prev_eval;
6213     DEBUG_EXECUTE_r(
6214      PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6215          REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6216     if ( nochange_depth )
6217      nochange_depth--;
6218
6219     PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6220          locinput); /* match B */
6221    }
6222
6223    if (locinput < reginfo->till) {
6224     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6225          "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6226          PL_colors[4],
6227          (long)(locinput - startpos),
6228          (long)(reginfo->till - startpos),
6229          PL_colors[5]));
6230
6231     sayNO_SILENT;  /* Cannot match: too short. */
6232    }
6233    sayYES;   /* Success! */
6234
6235   case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6236    DEBUG_EXECUTE_r(
6237    PerlIO_printf(Perl_debug_log,
6238     "%*s  %ssubpattern success...%s\n",
6239     REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6240    sayYES;   /* Success! */
6241
6242 #undef  ST
6243 #define ST st->u.ifmatch
6244
6245   {
6246    char *newstart;
6247
6248   case SUSPEND: /* (?>A) */
6249    ST.wanted = 1;
6250    newstart = locinput;
6251    goto do_ifmatch;
6252
6253   case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6254    ST.wanted = 0;
6255    goto ifmatch_trivial_fail_test;
6256
6257   case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6258    ST.wanted = 1;
6259   ifmatch_trivial_fail_test:
6260    if (scan->flags) {
6261     char * const s = HOPBACKc(locinput, scan->flags);
6262     if (!s) {
6263      /* trivial fail */
6264      if (logical) {
6265       logical = 0;
6266       sw = 1 - cBOOL(ST.wanted);
6267      }
6268      else if (ST.wanted)
6269       sayNO;
6270      next = scan + ARG(scan);
6271      if (next == scan)
6272       next = NULL;
6273      break;
6274     }
6275     newstart = s;
6276    }
6277    else
6278     newstart = locinput;
6279
6280   do_ifmatch:
6281    ST.me = scan;
6282    ST.logical = logical;
6283    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6284
6285    /* execute body of (?...A) */
6286    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6287    assert(0); /* NOTREACHED */
6288   }
6289
6290   case IFMATCH_A_fail: /* body of (?...A) failed */
6291    ST.wanted = !ST.wanted;
6292    /* FALL THROUGH */
6293
6294   case IFMATCH_A: /* body of (?...A) succeeded */
6295    if (ST.logical) {
6296     sw = cBOOL(ST.wanted);
6297    }
6298    else if (!ST.wanted)
6299     sayNO;
6300
6301    if (OP(ST.me) != SUSPEND) {
6302     /* restore old position except for (?>...) */
6303     locinput = st->locinput;
6304    }
6305    scan = ST.me + ARG(ST.me);
6306    if (scan == ST.me)
6307     scan = NULL;
6308    continue; /* execute B */
6309
6310 #undef ST
6311
6312   case LONGJMP: /*  alternative with many branches compiles to
6313      * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6314    next = scan + ARG(scan);
6315    if (next == scan)
6316     next = NULL;
6317    break;
6318
6319   case COMMIT:  /*  (*COMMIT)  */
6320    reginfo->cutpoint = reginfo->strend;
6321    /* FALLTHROUGH */
6322
6323   case PRUNE:   /*  (*PRUNE)   */
6324    if (!scan->flags)
6325     sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6326    PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6327    assert(0); /* NOTREACHED */
6328
6329   case COMMIT_next_fail:
6330    no_final = 1;
6331    /* FALLTHROUGH */
6332
6333   case OPFAIL:   /* (*FAIL)  */
6334    sayNO;
6335    assert(0); /* NOTREACHED */
6336
6337 #define ST st->u.mark
6338   case MARKPOINT: /*  (*MARK:foo)  */
6339    ST.prev_mark = mark_state;
6340    ST.mark_name = sv_commit = sv_yes_mark
6341     = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6342    mark_state = st;
6343    ST.mark_loc = locinput;
6344    PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6345    assert(0); /* NOTREACHED */
6346
6347   case MARKPOINT_next:
6348    mark_state = ST.prev_mark;
6349    sayYES;
6350    assert(0); /* NOTREACHED */
6351
6352   case MARKPOINT_next_fail:
6353    if (popmark && sv_eq(ST.mark_name,popmark))
6354    {
6355     if (ST.mark_loc > startpoint)
6356      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6357     popmark = NULL; /* we found our mark */
6358     sv_commit = ST.mark_name;
6359
6360     DEBUG_EXECUTE_r({
6361       PerlIO_printf(Perl_debug_log,
6362        "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6363        REPORT_CODE_OFF+depth*2, "",
6364        PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6365     });
6366    }
6367    mark_state = ST.prev_mark;
6368    sv_yes_mark = mark_state ?
6369     mark_state->u.mark.mark_name : NULL;
6370    sayNO;
6371    assert(0); /* NOTREACHED */
6372
6373   case SKIP:  /*  (*SKIP)  */
6374    if (scan->flags) {
6375     /* (*SKIP) : if we fail we cut here*/
6376     ST.mark_name = NULL;
6377     ST.mark_loc = locinput;
6378     PUSH_STATE_GOTO(SKIP_next,next, locinput);
6379    } else {
6380     /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6381     otherwise do nothing.  Meaning we need to scan
6382     */
6383     regmatch_state *cur = mark_state;
6384     SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6385
6386     while (cur) {
6387      if ( sv_eq( cur->u.mark.mark_name,
6388         find ) )
6389      {
6390       ST.mark_name = find;
6391       PUSH_STATE_GOTO( SKIP_next, next, locinput);
6392      }
6393      cur = cur->u.mark.prev_mark;
6394     }
6395    }
6396    /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6397    break;
6398
6399   case SKIP_next_fail:
6400    if (ST.mark_name) {
6401     /* (*CUT:NAME) - Set up to search for the name as we
6402     collapse the stack*/
6403     popmark = ST.mark_name;
6404    } else {
6405     /* (*CUT) - No name, we cut here.*/
6406     if (ST.mark_loc > startpoint)
6407      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6408     /* but we set sv_commit to latest mark_name if there
6409     is one so they can test to see how things lead to this
6410     cut */
6411     if (mark_state)
6412      sv_commit=mark_state->u.mark.mark_name;
6413    }
6414    no_final = 1;
6415    sayNO;
6416    assert(0); /* NOTREACHED */
6417 #undef ST
6418
6419   case LNBREAK: /* \R */
6420    if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6421     locinput += n;
6422    } else
6423     sayNO;
6424    break;
6425
6426   default:
6427    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6428       PTR2UV(scan), OP(scan));
6429    Perl_croak(aTHX_ "regexp memory corruption");
6430
6431   /* this is a point to jump to in order to increment
6432   * locinput by one character */
6433   increment_locinput:
6434    assert(!NEXTCHR_IS_EOS);
6435    if (utf8_target) {
6436     locinput += PL_utf8skip[nextchr];
6437     /* locinput is allowed to go 1 char off the end, but not 2+ */
6438     if (locinput > reginfo->strend)
6439      sayNO;
6440    }
6441    else
6442     locinput++;
6443    break;
6444
6445   } /* end switch */
6446
6447   /* switch break jumps here */
6448   scan = next; /* prepare to execute the next op and ... */
6449   continue;    /* ... jump back to the top, reusing st */
6450   assert(0); /* NOTREACHED */
6451
6452  push_yes_state:
6453   /* push a state that backtracks on success */
6454   st->u.yes.prev_yes_state = yes_state;
6455   yes_state = st;
6456   /* FALL THROUGH */
6457  push_state:
6458   /* push a new regex state, then continue at scan  */
6459   {
6460    regmatch_state *newst;
6461
6462    DEBUG_STACK_r({
6463     regmatch_state *cur = st;
6464     regmatch_state *curyes = yes_state;
6465     int curd = depth;
6466     regmatch_slab *slab = PL_regmatch_slab;
6467     for (;curd > -1;cur--,curd--) {
6468      if (cur < SLAB_FIRST(slab)) {
6469       slab = slab->prev;
6470       cur = SLAB_LAST(slab);
6471      }
6472      PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6473       REPORT_CODE_OFF + 2 + depth * 2,"",
6474       curd, PL_reg_name[cur->resume_state],
6475       (curyes == cur) ? "yes" : ""
6476      );
6477      if (curyes == cur)
6478       curyes = cur->u.yes.prev_yes_state;
6479     }
6480    } else
6481     DEBUG_STATE_pp("push")
6482    );
6483    depth++;
6484    st->locinput = locinput;
6485    newst = st+1;
6486    if (newst >  SLAB_LAST(PL_regmatch_slab))
6487     newst = S_push_slab(aTHX);
6488    PL_regmatch_state = newst;
6489
6490    locinput = pushinput;
6491    st = newst;
6492    continue;
6493    assert(0); /* NOTREACHED */
6494   }
6495  }
6496
6497  /*
6498  * We get here only if there's trouble -- normally "case END" is
6499  * the terminating point.
6500  */
6501  Perl_croak(aTHX_ "corrupted regexp pointers");
6502  /*NOTREACHED*/
6503  sayNO;
6504
6505 yes:
6506  if (yes_state) {
6507   /* we have successfully completed a subexpression, but we must now
6508   * pop to the state marked by yes_state and continue from there */
6509   assert(st != yes_state);
6510 #ifdef DEBUGGING
6511   while (st != yes_state) {
6512    st--;
6513    if (st < SLAB_FIRST(PL_regmatch_slab)) {
6514     PL_regmatch_slab = PL_regmatch_slab->prev;
6515     st = SLAB_LAST(PL_regmatch_slab);
6516    }
6517    DEBUG_STATE_r({
6518     if (no_final) {
6519      DEBUG_STATE_pp("pop (no final)");
6520     } else {
6521      DEBUG_STATE_pp("pop (yes)");
6522     }
6523    });
6524    depth--;
6525   }
6526 #else
6527   while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6528    || yes_state > SLAB_LAST(PL_regmatch_slab))
6529   {
6530    /* not in this slab, pop slab */
6531    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6532    PL_regmatch_slab = PL_regmatch_slab->prev;
6533    st = SLAB_LAST(PL_regmatch_slab);
6534   }
6535   depth -= (st - yes_state);
6536 #endif
6537   st = yes_state;
6538   yes_state = st->u.yes.prev_yes_state;
6539   PL_regmatch_state = st;
6540
6541   if (no_final)
6542    locinput= st->locinput;
6543   state_num = st->resume_state + no_final;
6544   goto reenter_switch;
6545  }
6546
6547  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6548       PL_colors[4], PL_colors[5]));
6549
6550  if (reginfo->info_aux_eval) {
6551   /* each successfully executed (?{...}) block does the equivalent of
6552   *   local $^R = do {...}
6553   * When popping the save stack, all these locals would be undone;
6554   * bypass this by setting the outermost saved $^R to the latest
6555   * value */
6556   if (oreplsv != GvSV(PL_replgv))
6557    sv_setsv(oreplsv, GvSV(PL_replgv));
6558  }
6559  result = 1;
6560  goto final_exit;
6561
6562 no:
6563  DEBUG_EXECUTE_r(
6564   PerlIO_printf(Perl_debug_log,
6565    "%*s  %sfailed...%s\n",
6566    REPORT_CODE_OFF+depth*2, "",
6567    PL_colors[4], PL_colors[5])
6568   );
6569
6570 no_silent:
6571  if (no_final) {
6572   if (yes_state) {
6573    goto yes;
6574   } else {
6575    goto final_exit;
6576   }
6577  }
6578  if (depth) {
6579   /* there's a previous state to backtrack to */
6580   st--;
6581   if (st < SLAB_FIRST(PL_regmatch_slab)) {
6582    PL_regmatch_slab = PL_regmatch_slab->prev;
6583    st = SLAB_LAST(PL_regmatch_slab);
6584   }
6585   PL_regmatch_state = st;
6586   locinput= st->locinput;
6587
6588   DEBUG_STATE_pp("pop");
6589   depth--;
6590   if (yes_state == st)
6591    yes_state = st->u.yes.prev_yes_state;
6592
6593   state_num = st->resume_state + 1; /* failure = success + 1 */
6594   goto reenter_switch;
6595  }
6596  result = 0;
6597
6598   final_exit:
6599  if (rex->intflags & PREGf_VERBARG_SEEN) {
6600   SV *sv_err = get_sv("REGERROR", 1);
6601   SV *sv_mrk = get_sv("REGMARK", 1);
6602   if (result) {
6603    sv_commit = &PL_sv_no;
6604    if (!sv_yes_mark)
6605     sv_yes_mark = &PL_sv_yes;
6606   } else {
6607    if (!sv_commit)
6608     sv_commit = &PL_sv_yes;
6609    sv_yes_mark = &PL_sv_no;
6610   }
6611   sv_setsv(sv_err, sv_commit);
6612   sv_setsv(sv_mrk, sv_yes_mark);
6613  }
6614
6615
6616  if (last_pushed_cv) {
6617   dSP;
6618   POP_MULTICALL;
6619   PERL_UNUSED_VAR(SP);
6620  }
6621
6622  assert(!result ||  locinput - reginfo->strbeg >= 0);
6623  return result ?  locinput - reginfo->strbeg : -1;
6624 }
6625
6626 /*
6627  - regrepeat - repeatedly match something simple, report how many
6628  *
6629  * What 'simple' means is a node which can be the operand of a quantifier like
6630  * '+', or {1,3}
6631  *
6632  * startposp - pointer a pointer to the start position.  This is updated
6633  *             to point to the byte following the highest successful
6634  *             match.
6635  * p         - the regnode to be repeatedly matched against.
6636  * reginfo   - struct holding match state, such as strend
6637  * max       - maximum number of things to match.
6638  * depth     - (for debugging) backtracking depth.
6639  */
6640 STATIC I32
6641 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6642    regmatch_info *const reginfo, I32 max, int depth)
6643 {
6644  dVAR;
6645  char *scan;     /* Pointer to current position in target string */
6646  I32 c;
6647  char *loceol = reginfo->strend;   /* local version */
6648  I32 hardcount = 0;  /* How many matches so far */
6649  bool utf8_target = reginfo->is_utf8_target;
6650  int to_complement = 0;  /* Invert the result? */
6651  UV utf8_flags;
6652  _char_class_number classnum;
6653 #ifndef DEBUGGING
6654  PERL_UNUSED_ARG(depth);
6655 #endif
6656
6657  PERL_ARGS_ASSERT_REGREPEAT;
6658
6659  scan = *startposp;
6660  if (max == REG_INFTY)
6661   max = I32_MAX;
6662  else if (! utf8_target && loceol - scan > max)
6663   loceol = scan + max;
6664
6665  /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6666  * to the maximum of how far we should go in it (leaving it set to the real
6667  * end, if the maximum permissible would take us beyond that).  This allows
6668  * us to make the loop exit condition that we haven't gone past <loceol> to
6669  * also mean that we haven't exceeded the max permissible count, saving a
6670  * test each time through the loop.  But it assumes that the OP matches a
6671  * single byte, which is true for most of the OPs below when applied to a
6672  * non-UTF-8 target.  Those relatively few OPs that don't have this
6673  * characteristic will have to compensate.
6674  *
6675  * There is no adjustment for UTF-8 targets, as the number of bytes per
6676  * character varies.  OPs will have to test both that the count is less
6677  * than the max permissible (using <hardcount> to keep track), and that we
6678  * are still within the bounds of the string (using <loceol>.  A few OPs
6679  * match a single byte no matter what the encoding.  They can omit the max
6680  * test if, for the UTF-8 case, they do the adjustment that was skipped
6681  * above.
6682  *
6683  * Thus, the code above sets things up for the common case; and exceptional
6684  * cases need extra work; the common case is to make sure <scan> doesn't
6685  * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6686  * count doesn't exceed the maximum permissible */
6687
6688  switch (OP(p)) {
6689  case REG_ANY:
6690   if (utf8_target) {
6691    while (scan < loceol && hardcount < max && *scan != '\n') {
6692     scan += UTF8SKIP(scan);
6693     hardcount++;
6694    }
6695   } else {
6696    while (scan < loceol && *scan != '\n')
6697     scan++;
6698   }
6699   break;
6700  case SANY:
6701   if (utf8_target) {
6702    while (scan < loceol && hardcount < max) {
6703     scan += UTF8SKIP(scan);
6704     hardcount++;
6705    }
6706   }
6707   else
6708    scan = loceol;
6709   break;
6710  case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6711   if (utf8_target && loceol - scan > max) {
6712
6713    /* <loceol> hadn't been adjusted in the UTF-8 case */
6714    scan +=  max;
6715   }
6716   else {
6717    scan = loceol;
6718   }
6719   break;
6720  case EXACT:
6721   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6722
6723   c = (U8)*STRING(p);
6724
6725   /* Can use a simple loop if the pattern char to match on is invariant
6726   * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6727   * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6728   * true iff it doesn't matter if the argument is in UTF-8 or not */
6729   if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
6730    if (utf8_target && loceol - scan > max) {
6731     /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6732     * since here, to match at all, 1 char == 1 byte */
6733     loceol = scan + max;
6734    }
6735    while (scan < loceol && UCHARAT(scan) == c) {
6736     scan++;
6737    }
6738   }
6739   else if (reginfo->is_utf8_pat) {
6740    if (utf8_target) {
6741     STRLEN scan_char_len;
6742
6743     /* When both target and pattern are UTF-8, we have to do
6744     * string EQ */
6745     while (hardcount < max
6746      && scan < loceol
6747      && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6748      && memEQ(scan, STRING(p), scan_char_len))
6749     {
6750      scan += scan_char_len;
6751      hardcount++;
6752     }
6753    }
6754    else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6755
6756     /* Target isn't utf8; convert the character in the UTF-8
6757     * pattern to non-UTF8, and do a simple loop */
6758     c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6759     while (scan < loceol && UCHARAT(scan) == c) {
6760      scan++;
6761     }
6762    } /* else pattern char is above Latin1, can't possibly match the
6763     non-UTF-8 target */
6764   }
6765   else {
6766
6767    /* Here, the string must be utf8; pattern isn't, and <c> is
6768    * different in utf8 than not, so can't compare them directly.
6769    * Outside the loop, find the two utf8 bytes that represent c, and
6770    * then look for those in sequence in the utf8 string */
6771    U8 high = UTF8_TWO_BYTE_HI(c);
6772    U8 low = UTF8_TWO_BYTE_LO(c);
6773
6774    while (hardcount < max
6775      && scan + 1 < loceol
6776      && UCHARAT(scan) == high
6777      && UCHARAT(scan + 1) == low)
6778    {
6779     scan += 2;
6780     hardcount++;
6781    }
6782   }
6783   break;
6784
6785  case EXACTFA:
6786   utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6787   goto do_exactf;
6788
6789  case EXACTFL:
6790   RXp_MATCH_TAINTED_on(prog);
6791   utf8_flags = FOLDEQ_UTF8_LOCALE;
6792   goto do_exactf;
6793
6794  case EXACTF:
6795    utf8_flags = 0;
6796    goto do_exactf;
6797
6798  case EXACTFU_SS:
6799  case EXACTFU_TRICKYFOLD:
6800  case EXACTFU:
6801   utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6802
6803  do_exactf: {
6804   int c1, c2;
6805   U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6806
6807   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6808
6809   if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6810           reginfo))
6811   {
6812    if (c1 == CHRTEST_VOID) {
6813     /* Use full Unicode fold matching */
6814     char *tmpeol = reginfo->strend;
6815     STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
6816     while (hardcount < max
6817       && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6818            STRING(p), NULL, pat_len,
6819            reginfo->is_utf8_pat, utf8_flags))
6820     {
6821      scan = tmpeol;
6822      tmpeol = reginfo->strend;
6823      hardcount++;
6824     }
6825    }
6826    else if (utf8_target) {
6827     if (c1 == c2) {
6828      while (scan < loceol
6829       && hardcount < max
6830       && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6831      {
6832       scan += UTF8SKIP(scan);
6833       hardcount++;
6834      }
6835     }
6836     else {
6837      while (scan < loceol
6838       && hardcount < max
6839       && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6840        || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6841      {
6842       scan += UTF8SKIP(scan);
6843       hardcount++;
6844      }
6845     }
6846    }
6847    else if (c1 == c2) {
6848     while (scan < loceol && UCHARAT(scan) == c1) {
6849      scan++;
6850     }
6851    }
6852    else {
6853     while (scan < loceol &&
6854      (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6855     {
6856      scan++;
6857     }
6858    }
6859   }
6860   break;
6861  }
6862  case ANYOF:
6863  case ANYOF_WARN_SUPER:
6864   if (utf8_target) {
6865    while (hardcount < max
6866     && scan < loceol
6867     && reginclass(prog, p, (U8*)scan, utf8_target))
6868    {
6869     scan += UTF8SKIP(scan);
6870     hardcount++;
6871    }
6872   } else {
6873    while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6874     scan++;
6875   }
6876   break;
6877
6878  /* The argument (FLAGS) to all the POSIX node types is the class number */
6879
6880  case NPOSIXL:
6881   to_complement = 1;
6882   /* FALLTHROUGH */
6883
6884  case POSIXL:
6885   RXp_MATCH_TAINTED_on(prog);
6886   if (! utf8_target) {
6887    while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6888                 *scan)))
6889    {
6890     scan++;
6891    }
6892   } else {
6893    while (hardcount < max && scan < loceol
6894     && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6895                 (U8 *) scan)))
6896    {
6897     scan += UTF8SKIP(scan);
6898     hardcount++;
6899    }
6900   }
6901   break;
6902
6903  case POSIXD:
6904   if (utf8_target) {
6905    goto utf8_posix;
6906   }
6907   /* FALLTHROUGH */
6908
6909  case POSIXA:
6910   if (utf8_target && loceol - scan > max) {
6911
6912    /* We didn't adjust <loceol> at the beginning of this routine
6913    * because is UTF-8, but it is actually ok to do so, since here, to
6914    * match, 1 char == 1 byte. */
6915    loceol = scan + max;
6916   }
6917   while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6918    scan++;
6919   }
6920   break;
6921
6922  case NPOSIXD:
6923   if (utf8_target) {
6924    to_complement = 1;
6925    goto utf8_posix;
6926   }
6927   /* FALL THROUGH */
6928
6929  case NPOSIXA:
6930   if (! utf8_target) {
6931    while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6932     scan++;
6933    }
6934   }
6935   else {
6936
6937    /* The complement of something that matches only ASCII matches all
6938    * UTF-8 variant code points, plus everything in ASCII that isn't
6939    * in the class. */
6940    while (hardcount < max && scan < loceol
6941     && (! UTF8_IS_INVARIANT(*scan)
6942      || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
6943    {
6944     scan += UTF8SKIP(scan);
6945     hardcount++;
6946    }
6947   }
6948   break;
6949
6950  case NPOSIXU:
6951   to_complement = 1;
6952   /* FALLTHROUGH */
6953
6954  case POSIXU:
6955   if (! utf8_target) {
6956    while (scan < loceol && to_complement
6957         ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
6958    {
6959     scan++;
6960    }
6961   }
6962   else {
6963  utf8_posix:
6964    classnum = (_char_class_number) FLAGS(p);
6965    if (classnum < _FIRST_NON_SWASH_CC) {
6966
6967     /* Here, a swash is needed for above-Latin1 code points.
6968     * Process as many Latin1 code points using the built-in rules.
6969     * Go to another loop to finish processing upon encountering
6970     * the first Latin1 code point.  We could do that in this loop
6971     * as well, but the other way saves having to test if the swash
6972     * has been loaded every time through the loop: extra space to
6973     * save a test. */
6974     while (hardcount < max && scan < loceol) {
6975      if (UTF8_IS_INVARIANT(*scan)) {
6976       if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
6977                 classnum))))
6978       {
6979        break;
6980       }
6981       scan++;
6982      }
6983      else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
6984       if (! (to_complement
6985        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
6986                 *(scan + 1)),
6987              classnum))))
6988       {
6989        break;
6990       }
6991       scan += 2;
6992      }
6993      else {
6994       goto found_above_latin1;
6995      }
6996
6997      hardcount++;
6998     }
6999    }
7000    else {
7001     /* For these character classes, the knowledge of how to handle
7002     * every code point is compiled in to Perl via a macro.  This
7003     * code is written for making the loops as tight as possible.
7004     * It could be refactored to save space instead */
7005     switch (classnum) {
7006      case _CC_ENUM_SPACE:    /* XXX would require separate code
7007            if we revert the change of \v
7008            matching this */
7009       /* FALL THROUGH */
7010      case _CC_ENUM_PSXSPC:
7011       while (hardcount < max
7012        && scan < loceol
7013        && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7014       {
7015        scan += UTF8SKIP(scan);
7016        hardcount++;
7017       }
7018       break;
7019      case _CC_ENUM_BLANK:
7020       while (hardcount < max
7021        && scan < loceol
7022        && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7023       {
7024        scan += UTF8SKIP(scan);
7025        hardcount++;
7026       }
7027       break;
7028      case _CC_ENUM_XDIGIT:
7029       while (hardcount < max
7030        && scan < loceol
7031        && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7032       {
7033        scan += UTF8SKIP(scan);
7034        hardcount++;
7035       }
7036       break;
7037      case _CC_ENUM_VERTSPACE:
7038       while (hardcount < max
7039        && scan < loceol
7040        && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7041       {
7042        scan += UTF8SKIP(scan);
7043        hardcount++;
7044       }
7045       break;
7046      case _CC_ENUM_CNTRL:
7047       while (hardcount < max
7048        && scan < loceol
7049        && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7050       {
7051        scan += UTF8SKIP(scan);
7052        hardcount++;
7053       }
7054       break;
7055      default:
7056       Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7057     }
7058    }
7059   }
7060   break;
7061
7062  found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7063
7064   /* Load the swash if not already present */
7065   if (! PL_utf8_swash_ptrs[classnum]) {
7066    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7067    PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7068           "utf8", swash_property_names[classnum],
7069           &PL_sv_undef, 1, 0, NULL, &flags);
7070   }
7071
7072   while (hardcount < max && scan < loceol
7073    && to_complement ^ cBOOL(_generic_utf8(
7074          classnum,
7075          scan,
7076          swash_fetch(PL_utf8_swash_ptrs[classnum],
7077             (U8 *) scan,
7078             TRUE))))
7079   {
7080    scan += UTF8SKIP(scan);
7081    hardcount++;
7082   }
7083   break;
7084
7085  case LNBREAK:
7086   if (utf8_target) {
7087    while (hardcount < max && scan < loceol &&
7088      (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7089     scan += c;
7090     hardcount++;
7091    }
7092   } else {
7093    /* LNBREAK can match one or two latin chars, which is ok, but we
7094    * have to use hardcount in this situation, and throw away the
7095    * adjustment to <loceol> done before the switch statement */
7096    loceol = reginfo->strend;
7097    while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7098     scan+=c;
7099     hardcount++;
7100    }
7101   }
7102   break;
7103
7104  case BOUND:
7105  case BOUNDA:
7106  case BOUNDL:
7107  case BOUNDU:
7108  case EOS:
7109  case GPOS:
7110  case KEEPS:
7111  case NBOUND:
7112  case NBOUNDA:
7113  case NBOUNDL:
7114  case NBOUNDU:
7115  case OPFAIL:
7116  case SBOL:
7117  case SEOL:
7118   /* These are all 0 width, so match right here or not at all. */
7119   break;
7120
7121  default:
7122   Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7123   assert(0); /* NOTREACHED */
7124
7125  }
7126
7127  if (hardcount)
7128   c = hardcount;
7129  else
7130   c = scan - *startposp;
7131  *startposp = scan;
7132
7133  DEBUG_r({
7134   GET_RE_DEBUG_FLAGS_DECL;
7135   DEBUG_EXECUTE_r({
7136    SV * const prop = sv_newmortal();
7137    regprop(prog, prop, p);
7138    PerlIO_printf(Perl_debug_log,
7139       "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7140       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7141   });
7142  });
7143
7144  return(c);
7145 }
7146
7147
7148 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7149 /*
7150 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7151 create a copy so that changes the caller makes won't change the shared one.
7152 If <altsvp> is non-null, will return NULL in it, for back-compat.
7153  */
7154 SV *
7155 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7156 {
7157  PERL_ARGS_ASSERT_REGCLASS_SWASH;
7158
7159  if (altsvp) {
7160   *altsvp = NULL;
7161  }
7162
7163  return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7164 }
7165 #endif
7166
7167 STATIC SV *
7168 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7169 {
7170  /* Returns the swash for the input 'node' in the regex 'prog'.
7171  * If <doinit> is true, will attempt to create the swash if not already
7172  *   done.
7173  * If <listsvp> is non-null, will return the swash initialization string in
7174  *   it.
7175  * Tied intimately to how regcomp.c sets up the data structure */
7176
7177  dVAR;
7178  SV *sw  = NULL;
7179  SV *si  = NULL;
7180  SV*  invlist = NULL;
7181
7182  RXi_GET_DECL(prog,progi);
7183  const struct reg_data * const data = prog ? progi->data : NULL;
7184
7185  PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7186
7187  assert(ANYOF_NONBITMAP(node));
7188
7189  if (data && data->count) {
7190   const U32 n = ARG(node);
7191
7192   if (data->what[n] == 's') {
7193    SV * const rv = MUTABLE_SV(data->data[n]);
7194    AV * const av = MUTABLE_AV(SvRV(rv));
7195    SV **const ary = AvARRAY(av);
7196    U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7197
7198    si = *ary; /* ary[0] = the string to initialize the swash with */
7199
7200    /* Elements 2 and 3 are either both present or both absent. [2] is
7201    * any inversion list generated at compile time; [3] indicates if
7202    * that inversion list has any user-defined properties in it. */
7203    if (av_len(av) >= 2) {
7204     invlist = ary[2];
7205     if (SvUV(ary[3])) {
7206      swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7207     }
7208    }
7209    else {
7210     invlist = NULL;
7211    }
7212
7213    /* Element [1] is reserved for the set-up swash.  If already there,
7214    * return it; if not, create it and store it there */
7215    if (SvROK(ary[1])) {
7216     sw = ary[1];
7217    }
7218    else if (si && doinit) {
7219
7220     sw = _core_swash_init("utf8", /* the utf8 package */
7221          "", /* nameless */
7222          si,
7223          1, /* binary */
7224          0, /* not from tr/// */
7225          invlist,
7226          &swash_init_flags);
7227     (void)av_store(av, 1, sw);
7228    }
7229   }
7230  }
7231
7232  if (listsvp) {
7233   SV* matches_string = newSVpvn("", 0);
7234
7235   /* Use the swash, if any, which has to have incorporated into it all
7236   * possibilities */
7237   if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7238    && (si && si != &PL_sv_undef))
7239   {
7240
7241    /* If no swash, use the input initialization string, if available */
7242    sv_catsv(matches_string, si);
7243   }
7244
7245   /* Add the inversion list to whatever we have.  This may have come from
7246   * the swash, or from an input parameter */
7247   if (invlist) {
7248    sv_catsv(matches_string, _invlist_contents(invlist));
7249   }
7250   *listsvp = matches_string;
7251  }
7252
7253  return sw;
7254 }
7255
7256 /*
7257  - reginclass - determine if a character falls into a character class
7258
7259   n is the ANYOF regnode
7260   p is the target string
7261   utf8_target tells whether p is in UTF-8.
7262
7263   Returns true if matched; false otherwise.
7264
7265   Note that this can be a synthetic start class, a combination of various
7266   nodes, so things you think might be mutually exclusive, such as locale,
7267   aren't.  It can match both locale and non-locale
7268
7269  */
7270
7271 STATIC bool
7272 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7273 {
7274  dVAR;
7275  const char flags = ANYOF_FLAGS(n);
7276  bool match = FALSE;
7277  UV c = *p;
7278
7279  PERL_ARGS_ASSERT_REGINCLASS;
7280
7281  /* If c is not already the code point, get it.  Note that
7282  * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7283  if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7284   STRLEN c_len = 0;
7285   c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7286     (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7287     | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7288     /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7289     * UTF8_ALLOW_FFFF */
7290   if (c_len == (STRLEN)-1)
7291    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7292  }
7293
7294  /* If this character is potentially in the bitmap, check it */
7295  if (c < 256) {
7296   if (ANYOF_BITMAP_TEST(n, c))
7297    match = TRUE;
7298   else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7299     && ! utf8_target
7300     && ! isASCII(c))
7301   {
7302    match = TRUE;
7303   }
7304   else if (flags & ANYOF_LOCALE) {
7305    RXp_MATCH_TAINTED_on(prog);
7306
7307    if ((flags & ANYOF_LOC_FOLD)
7308     && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7309    {
7310     match = TRUE;
7311    }
7312    else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7313
7314     /* The data structure is arranged so bits 0, 2, 4, ... are set
7315     * if the class includes the Posix character class given by
7316     * bit/2; and 1, 3, 5, ... are set if the class includes the
7317     * complemented Posix class given by int(bit/2).  So we loop
7318     * through the bits, each time changing whether we complement
7319     * the result or not.  Suppose for the sake of illustration
7320     * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7321     * is set, it means there is a match for this ANYOF node if the
7322     * character is in the class given by the expression (0 / 2 = 0
7323     * = \w).  If it is in that class, isFOO_lc() will return 1,
7324     * and since 'to_complement' is 0, the result will stay TRUE,
7325     * and we exit the loop.  Suppose instead that bit 0 is 0, but
7326     * bit 1 is 1.  That means there is a match if the character
7327     * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7328     * but will on bit 1.  On the second iteration 'to_complement'
7329     * will be 1, so the exclusive or will reverse things, so we
7330     * are testing for \W.  On the third iteration, 'to_complement'
7331     * will be 0, and we would be testing for \s; the fourth
7332     * iteration would test for \S, etc.
7333     *
7334     * Note that this code assumes that all the classes are closed
7335     * under folding.  For example, if a character matches \w, then
7336     * its fold does too; and vice versa.  This should be true for
7337     * any well-behaved locale for all the currently defined Posix
7338     * classes, except for :lower: and :upper:, which are handled
7339     * by the pseudo-class :cased: which matches if either of the
7340     * other two does.  To get rid of this assumption, an outer
7341     * loop could be used below to iterate over both the source
7342     * character, and its fold (if different) */
7343
7344     int count = 0;
7345     int to_complement = 0;
7346     while (count < ANYOF_MAX) {
7347      if (ANYOF_CLASS_TEST(n, count)
7348       && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7349      {
7350       match = TRUE;
7351       break;
7352      }
7353      count++;
7354      to_complement ^= 1;
7355     }
7356    }
7357   }
7358  }
7359
7360  /* If the bitmap didn't (or couldn't) match, and something outside the
7361  * bitmap could match, try that.  Locale nodes specify completely the
7362  * behavior of code points in the bit map (otherwise, a utf8 target would
7363  * cause them to be treated as Unicode and not locale), except in
7364  * the very unlikely event when this node is a synthetic start class, which
7365  * could be a combination of locale and non-locale nodes.  So allow locale
7366  * to match for the synthetic start class, which will give a false
7367  * positive that will be resolved when the match is done again as not part
7368  * of the synthetic start class */
7369  if (!match) {
7370   if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7371    match = TRUE; /* Everything above 255 matches */
7372   }
7373   else if (ANYOF_NONBITMAP(n)
7374     && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7375      || (utf8_target
7376       && (c >=256
7377        || (! (flags & ANYOF_LOCALE))
7378        || OP(n) == ANYOF_SYNTHETIC))))
7379   {
7380    SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7381    if (sw) {
7382     U8 * utf8_p;
7383     if (utf8_target) {
7384      utf8_p = (U8 *) p;
7385     } else { /* Convert to utf8 */
7386      STRLEN len = 1;
7387      utf8_p = bytes_to_utf8(p, &len);
7388     }
7389
7390     if (swash_fetch(sw, utf8_p, TRUE)) {
7391      match = TRUE;
7392     }
7393
7394     /* If we allocated a string above, free it */
7395     if (! utf8_target) Safefree(utf8_p);
7396    }
7397   }
7398
7399   if (UNICODE_IS_SUPER(c)
7400    && OP(n) == ANYOF_WARN_SUPER
7401    && ckWARN_d(WARN_NON_UNICODE))
7402   {
7403    Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7404     "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7405   }
7406  }
7407
7408  /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7409  return cBOOL(flags & ANYOF_INVERT) ^ match;
7410 }
7411
7412 STATIC U8 *
7413 S_reghop3(U8 *s, I32 off, const U8* lim)
7414 {
7415  /* return the position 'off' UTF-8 characters away from 's', forward if
7416  * 'off' >= 0, backwards if negative.  But don't go outside of position
7417  * 'lim', which better be < s  if off < 0 */
7418
7419  dVAR;
7420
7421  PERL_ARGS_ASSERT_REGHOP3;
7422
7423  if (off >= 0) {
7424   while (off-- && s < lim) {
7425    /* XXX could check well-formedness here */
7426    s += UTF8SKIP(s);
7427   }
7428  }
7429  else {
7430   while (off++ && s > lim) {
7431    s--;
7432    if (UTF8_IS_CONTINUED(*s)) {
7433     while (s > lim && UTF8_IS_CONTINUATION(*s))
7434      s--;
7435    }
7436    /* XXX could check well-formedness here */
7437   }
7438  }
7439  return s;
7440 }
7441
7442 #ifdef XXX_dmq
7443 /* there are a bunch of places where we use two reghop3's that should
7444    be replaced with this routine. but since thats not done yet
7445    we ifdef it out - dmq
7446 */
7447 STATIC U8 *
7448 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7449 {
7450  dVAR;
7451
7452  PERL_ARGS_ASSERT_REGHOP4;
7453
7454  if (off >= 0) {
7455   while (off-- && s < rlim) {
7456    /* XXX could check well-formedness here */
7457    s += UTF8SKIP(s);
7458   }
7459  }
7460  else {
7461   while (off++ && s > llim) {
7462    s--;
7463    if (UTF8_IS_CONTINUED(*s)) {
7464     while (s > llim && UTF8_IS_CONTINUATION(*s))
7465      s--;
7466    }
7467    /* XXX could check well-formedness here */
7468   }
7469  }
7470  return s;
7471 }
7472 #endif
7473
7474 STATIC U8 *
7475 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7476 {
7477  dVAR;
7478
7479  PERL_ARGS_ASSERT_REGHOPMAYBE3;
7480
7481  if (off >= 0) {
7482   while (off-- && s < lim) {
7483    /* XXX could check well-formedness here */
7484    s += UTF8SKIP(s);
7485   }
7486   if (off >= 0)
7487    return NULL;
7488  }
7489  else {
7490   while (off++ && s > lim) {
7491    s--;
7492    if (UTF8_IS_CONTINUED(*s)) {
7493     while (s > lim && UTF8_IS_CONTINUATION(*s))
7494      s--;
7495    }
7496    /* XXX could check well-formedness here */
7497   }
7498   if (off <= 0)
7499    return NULL;
7500  }
7501  return s;
7502 }
7503
7504
7505 /* when executing a regex that may have (?{}), extra stuff needs setting
7506    up that will be visible to the called code, even before the current
7507    match has finished. In particular:
7508
7509    * $_ is localised to the SV currently being matched;
7510    * pos($_) is created if necessary, ready to be updated on each call-out
7511  to code;
7512    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7513  isn't set until the current pattern is successfully finished), so that
7514  $1 etc of the match-so-far can be seen;
7515    * save the old values of subbeg etc of the current regex, and  set then
7516  to the current string (again, this is normally only done at the end
7517  of execution)
7518 */
7519
7520 static void
7521 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7522 {
7523  MAGIC *mg;
7524  regexp *const rex = ReANY(reginfo->prog);
7525  regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7526
7527  eval_state->rex = rex;
7528
7529  if (reginfo->sv) {
7530   /* Make $_ available to executed code. */
7531   if (reginfo->sv != DEFSV) {
7532    SAVE_DEFSV;
7533    DEFSV_set(reginfo->sv);
7534   }
7535
7536   if (!(mg = mg_find_mglob(reginfo->sv))) {
7537    /* prepare for quick setting of pos */
7538    mg = sv_magicext_mglob(reginfo->sv);
7539    mg->mg_len = -1;
7540   }
7541   eval_state->pos_magic = mg;
7542   eval_state->pos       = mg->mg_len;
7543  }
7544  else
7545   eval_state->pos_magic = NULL;
7546
7547  if (!PL_reg_curpm) {
7548   /* PL_reg_curpm is a fake PMOP that we can attach the current
7549   * regex to and point PL_curpm at, so that $1 et al are visible
7550   * within a /(?{})/. It's just allocated once per interpreter the
7551   * first time its needed */
7552   Newxz(PL_reg_curpm, 1, PMOP);
7553 #ifdef USE_ITHREADS
7554   {
7555    SV* const repointer = &PL_sv_undef;
7556    /* this regexp is also owned by the new PL_reg_curpm, which
7557    will try to free it.  */
7558    av_push(PL_regex_padav, repointer);
7559    PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7560    PL_regex_pad = AvARRAY(PL_regex_padav);
7561   }
7562 #endif
7563  }
7564  SET_reg_curpm(reginfo->prog);
7565  eval_state->curpm = PL_curpm;
7566  PL_curpm = PL_reg_curpm;
7567  if (RXp_MATCH_COPIED(rex)) {
7568   /*  Here is a serious problem: we cannot rewrite subbeg,
7569    since it may be needed if this match fails.  Thus
7570    $` inside (?{}) could fail... */
7571   eval_state->subbeg     = rex->subbeg;
7572   eval_state->sublen     = rex->sublen;
7573   eval_state->suboffset  = rex->suboffset;
7574   eval_state->subcoffset = rex->subcoffset;
7575 #ifdef PERL_ANY_COW
7576   eval_state->saved_copy = rex->saved_copy;
7577 #endif
7578   RXp_MATCH_COPIED_off(rex);
7579  }
7580  else
7581   eval_state->subbeg = NULL;
7582  rex->subbeg = (char *)reginfo->strbeg;
7583  rex->suboffset = 0;
7584  rex->subcoffset = 0;
7585  rex->sublen = reginfo->strend - reginfo->strbeg;
7586 }
7587
7588
7589 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7590
7591 static void
7592 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7593 {
7594  dVAR;
7595  regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7596  regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
7597  regmatch_slab *s;
7598
7599  Safefree(aux->poscache);
7600
7601  if (eval_state) {
7602
7603   /* undo the effects of S_setup_eval_state() */
7604
7605   if (eval_state->subbeg) {
7606    regexp * const rex = eval_state->rex;
7607    rex->subbeg     = eval_state->subbeg;
7608    rex->sublen     = eval_state->sublen;
7609    rex->suboffset  = eval_state->suboffset;
7610    rex->subcoffset = eval_state->subcoffset;
7611 #ifdef PERL_ANY_COW
7612    rex->saved_copy = eval_state->saved_copy;
7613 #endif
7614    RXp_MATCH_COPIED_on(rex);
7615   }
7616   if (eval_state->pos_magic)
7617    eval_state->pos_magic->mg_len = eval_state->pos;
7618
7619   PL_curpm = eval_state->curpm;
7620  }
7621
7622  PL_regmatch_state = aux->old_regmatch_state;
7623  PL_regmatch_slab  = aux->old_regmatch_slab;
7624
7625  /* free all slabs above current one - this must be the last action
7626  * of this function, as aux and eval_state are allocated within
7627  * slabs and may be freed here */
7628
7629  s = PL_regmatch_slab->next;
7630  if (s) {
7631   PL_regmatch_slab->next = NULL;
7632   while (s) {
7633    regmatch_slab * const osl = s;
7634    s = s->next;
7635    Safefree(osl);
7636   }
7637  }
7638 }
7639
7640
7641 STATIC void
7642 S_to_utf8_substr(pTHX_ regexp *prog)
7643 {
7644  /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7645  * on the converted value */
7646
7647  int i = 1;
7648
7649  PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7650
7651  do {
7652   if (prog->substrs->data[i].substr
7653    && !prog->substrs->data[i].utf8_substr) {
7654    SV* const sv = newSVsv(prog->substrs->data[i].substr);
7655    prog->substrs->data[i].utf8_substr = sv;
7656    sv_utf8_upgrade(sv);
7657    if (SvVALID(prog->substrs->data[i].substr)) {
7658     if (SvTAIL(prog->substrs->data[i].substr)) {
7659      /* Trim the trailing \n that fbm_compile added last
7660      time.  */
7661      SvCUR_set(sv, SvCUR(sv) - 1);
7662      /* Whilst this makes the SV technically "invalid" (as its
7663      buffer is no longer followed by "\0") when fbm_compile()
7664      adds the "\n" back, a "\0" is restored.  */
7665      fbm_compile(sv, FBMcf_TAIL);
7666     } else
7667      fbm_compile(sv, 0);
7668    }
7669    if (prog->substrs->data[i].substr == prog->check_substr)
7670     prog->check_utf8 = sv;
7671   }
7672  } while (i--);
7673 }
7674
7675 STATIC bool
7676 S_to_byte_substr(pTHX_ regexp *prog)
7677 {
7678  /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7679  * on the converted value; returns FALSE if can't be converted. */
7680
7681  dVAR;
7682  int i = 1;
7683
7684  PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7685
7686  do {
7687   if (prog->substrs->data[i].utf8_substr
7688    && !prog->substrs->data[i].substr) {
7689    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7690    if (! sv_utf8_downgrade(sv, TRUE)) {
7691     return FALSE;
7692    }
7693    if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7694     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7695      /* Trim the trailing \n that fbm_compile added last
7696       time.  */
7697      SvCUR_set(sv, SvCUR(sv) - 1);
7698      fbm_compile(sv, FBMcf_TAIL);
7699     } else
7700      fbm_compile(sv, 0);
7701    }
7702    prog->substrs->data[i].substr = sv;
7703    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7704     prog->check_substr = sv;
7705   }
7706  } while (i--);
7707
7708  return TRUE;
7709 }
7710
7711 /*
7712  * Local variables:
7713  * c-indentation-style: bsd
7714  * c-basic-offset: 4
7715  * indent-tabs-mode: nil
7716  * End:
7717  *
7718  * ex: set ts=8 sts=4 sw=4 et:
7719  */