src/5019001/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *  One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /* At least one required character in the target string is expressible only in
  41  * UTF-8. */
  42 static const char* const non_utf8_target_but_utf8_required
  43     = "Can't match, because target string needs to be in UTF-8\n";
  44
  45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  46  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  47  goto target; \
  48 } STMT_END
  49
  50 /*
  51  * pregcomp and pregexec -- regsub and regerror are not used in perl
  52  *
  53  * Copyright (c) 1986 by University of Toronto.
  54  * Written by Henry Spencer.  Not derived from licensed software.
  55  *
  56  * Permission is granted to anyone to use this software for any
  57  * purpose on any computer system, and to redistribute it freely,
  58  * subject to the following restrictions:
  59  *
  60  * 1. The author is not responsible for the consequences of use of
  61  *  this software, no matter how awful, even if they arise
  62  *  from defects in it.
  63  *
  64  * 2. The origin of this software must not be misrepresented, either
  65  *  by explicit claim or by omission.
  66  *
  67  * 3. Altered versions must be plainly marked as such, and must not
  68  *  be misrepresented as being the original software.
  69  *
  70  ****    Alterations to Henry's code are...
  71  ****
  72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  73  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  74  ****    by Larry Wall and others
  75  ****
  76  ****    You may distribute under the terms of either the GNU General Public
  77  ****    License or the Artistic License, as specified in the README file.
  78  *
  79  * Beware that some of this code is subtly aware of the way operator
  80  * precedence is structured in regular expressions.  Serious changes in
  81  * regular-expression syntax might require a total rethink.
  82  */
  83 #include "EXTERN.h"
  84 #define PERL_IN_REGEXEC_C
  85 #include "perl.h"
  86 #include "re_defs.h"
  87
  88 #ifdef PERL_IN_XSUB_RE
  89 #  include "re_comp.h"
  90 #else
  91 #  include "regcomp.h"
  92 #endif
  93
  94 #include "inline_invlist.c"
  95 #include "unicode_constants.h"
  96
  97 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  98
  99 #ifndef STATIC
 100 #define STATIC static
 101 #endif
 102
 103 /* Valid for non-utf8 strings: avoids the reginclass
 104  * call if there are no complications: i.e., if everything matchable is
 105  * straight forward in the bitmap */
 106 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0)   \
 107            : ANYOF_BITMAP_TEST(p,*(c)))
 108
 109 /*
 110  * Forwards.
 111  */
 112
 113 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 114 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 115
 116 #define HOPc(pos,off) \
 117   (char *)(reginfo->is_utf8_target \
 118    ? reghop3((U8*)pos, off, \
 119      (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 120    : (U8*)(pos + off))
 121 #define HOPBACKc(pos, off) \
 122   (char*)(reginfo->is_utf8_target \
 123    ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 124    : (pos - off >= reginfo->strbeg) \
 125     ? (U8*)pos - off  \
 126     : NULL)
 127
 128 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 129 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 130
 131
 132 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 133 #define NEXTCHR_IS_EOS (nextchr < 0)
 134
 135 #define SET_nextchr \
 136  nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 137
 138 #define SET_locinput(p) \
 139  locinput = (p);  \
 140  SET_nextchr
 141
 142
 143 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name) STMT_START {            \
 144   if (!swash_ptr) {                                                     \
 145    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 146    swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 147           1, 0, NULL, &flags);                 \
 148    assert(swash_ptr);                                                \
 149   }                                                                     \
 150  } STMT_END
 151
 152 /* If in debug mode, we test that a known character properly matches */
 153 #ifdef DEBUGGING
 154 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 155           property_name,                      \
 156           utf8_char_in_property)              \
 157   LOAD_UTF8_CHARCLASS(swash_ptr, property_name);                        \
 158   assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 159 #else
 160 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 161           property_name,                      \
 162           utf8_char_in_property)              \
 163   LOAD_UTF8_CHARCLASS(swash_ptr, property_name)
 164 #endif
 165
 166 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 167           PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 168           swash_property_names[_CC_WORDCHAR],   \
 169           GREEK_SMALL_LETTER_IOTA_UTF8)
 170
 171 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 172  STMT_START {                                                              \
 173   LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 174          "_X_regular_begin",                    \
 175          GREEK_SMALL_LETTER_IOTA_UTF8);         \
 176   LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 177          "_X_extend",                           \
 178          COMBINING_GRAVE_ACCENT_UTF8);          \
 179  } STMT_END
 180
 181 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
 182 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 183
 184 /* for use after a quantifier and before an EXACT-like node -- japhy */
 185 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 186  *
 187  * NOTE that *nothing* that affects backtracking should be in here, specifically
 188  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 189  * node that is in between two EXACT like nodes when ascertaining what the required
 190  * "follow" character is. This should probably be moved to regex compile time
 191  * although it may be done at run time beause of the REF possibility - more
 192  * investigation required. -- demerphq
 193 */
 194 #define JUMPABLE(rn) (      \
 195  OP(rn) == OPEN ||       \
 196  (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 197  OP(rn) == EVAL ||   \
 198  OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 199  OP(rn) == PLUS || OP(rn) == MINMOD || \
 200  OP(rn) == KEEPS || \
 201  (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 202 )
 203 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 204
 205 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 206
 207 #if 0
 208 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 209    we don't need this definition. */
 210 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 211 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 212 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 213
 214 #else
 215 /* ... so we use this as its faster. */
 216 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 217 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 218 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 219 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 220
 221 #endif
 222
 223 /*
 224   Search for mandatory following text node; for lookahead, the text must
 225   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 226 */
 227 #define FIND_NEXT_IMPT(rn) STMT_START { \
 228  while (JUMPABLE(rn)) { \
 229   const OPCODE type = OP(rn); \
 230   if (type == SUSPEND || PL_regkind[type] == CURLY) \
 231    rn = NEXTOPER(NEXTOPER(rn)); \
 232   else if (type == PLUS) \
 233    rn = NEXTOPER(rn); \
 234   else if (type == IFMATCH) \
 235    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 236   else rn += NEXT_OFF(rn); \
 237  } \
 238 } STMT_END
 239
 240 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 241  * These are for the pre-composed Hangul syllables, which are all in a
 242  * contiguous block and arranged there in such a way so as to facilitate
 243  * alorithmic determination of their characteristics.  As such, they don't need
 244  * a swash, but can be determined by simple arithmetic.  Almost all are
 245  * GCB=LVT, but every 28th one is a GCB=LV */
 246 #define SBASE 0xAC00    /* Start of block */
 247 #define SCount 11172    /* Length of block */
 248 #define TCount 28
 249
 250 #define SLAB_FIRST(s) (&(s)->states[0])
 251 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 252
 253 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 254 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 255 static regmatch_state * S_push_slab(pTHX);
 256
 257 #define REGCP_PAREN_ELEMS 3
 258 #define REGCP_OTHER_ELEMS 3
 259 #define REGCP_FRAME_ELEMS 1
 260 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 261  * are needed for the regexp context stack bookkeeping. */
 262
 263 STATIC CHECKPOINT
 264 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 265 {
 266  dVAR;
 267  const int retval = PL_savestack_ix;
 268  const int paren_elems_to_push =
 269     (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 270  const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 271  const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 272  I32 p;
 273  GET_RE_DEBUG_FLAGS_DECL;
 274
 275  PERL_ARGS_ASSERT_REGCPPUSH;
 276
 277  if (paren_elems_to_push < 0)
 278   Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
 279     paren_elems_to_push);
 280
 281  if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 282   Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 283     " out of range (%lu-%ld)",
 284     total_elems,
 285     (unsigned long)maxopenparen,
 286     (long)parenfloor);
 287
 288  SSGROW(total_elems + REGCP_FRAME_ELEMS);
 289
 290  DEBUG_BUFFERS_r(
 291   if ((int)maxopenparen > (int)parenfloor)
 292    PerlIO_printf(Perl_debug_log,
 293     "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 294     PTR2UV(rex),
 295     PTR2UV(rex->offs)
 296    );
 297  );
 298  for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 299 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 300   SSPUSHINT(rex->offs[p].end);
 301   SSPUSHINT(rex->offs[p].start);
 302   SSPUSHINT(rex->offs[p].start_tmp);
 303   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 304    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 305    (UV)p,
 306    (IV)rex->offs[p].start,
 307    (IV)rex->offs[p].start_tmp,
 308    (IV)rex->offs[p].end
 309   ));
 310  }
 311 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 312  SSPUSHINT(maxopenparen);
 313  SSPUSHINT(rex->lastparen);
 314  SSPUSHINT(rex->lastcloseparen);
 315  SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 316
 317  return retval;
 318 }
 319
 320 /* These are needed since we do not localize EVAL nodes: */
 321 #define REGCP_SET(cp)                                           \
 322  DEBUG_STATE_r(                                              \
 323    PerlIO_printf(Perl_debug_log,          \
 324     "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 325     (IV)PL_savestack_ix));                          \
 326  cp = PL_savestack_ix
 327
 328 #define REGCP_UNWIND(cp)                                        \
 329  DEBUG_STATE_r(                                              \
 330   if (cp != PL_savestack_ix)                   \
 331     PerlIO_printf(Perl_debug_log,          \
 332     "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 333     (IV)(cp), (IV)PL_savestack_ix));                \
 334  regcpblow(cp)
 335
 336 #define UNWIND_PAREN(lp, lcp)               \
 337  for (n = rex->lastparen; n > lp; n--)   \
 338   rex->offs[n].end = -1;              \
 339  rex->lastparen = n;                     \
 340  rex->lastcloseparen = lcp;
 341
 342
 343 STATIC void
 344 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 345 {
 346  dVAR;
 347  UV i;
 348  U32 paren;
 349  GET_RE_DEBUG_FLAGS_DECL;
 350
 351  PERL_ARGS_ASSERT_REGCPPOP;
 352
 353  /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 354  i = SSPOPUV;
 355  assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 356  i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 357  rex->lastcloseparen = SSPOPINT;
 358  rex->lastparen = SSPOPINT;
 359  *maxopenparen_p = SSPOPINT;
 360
 361  i -= REGCP_OTHER_ELEMS;
 362  /* Now restore the parentheses context. */
 363  DEBUG_BUFFERS_r(
 364   if (i || rex->lastparen + 1 <= rex->nparens)
 365    PerlIO_printf(Perl_debug_log,
 366     "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 367     PTR2UV(rex),
 368     PTR2UV(rex->offs)
 369    );
 370  );
 371  paren = *maxopenparen_p;
 372  for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 373   I32 tmps;
 374   rex->offs[paren].start_tmp = SSPOPINT;
 375   rex->offs[paren].start = SSPOPINT;
 376   tmps = SSPOPINT;
 377   if (paren <= rex->lastparen)
 378    rex->offs[paren].end = tmps;
 379   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 380    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 381    (UV)paren,
 382    (IV)rex->offs[paren].start,
 383    (IV)rex->offs[paren].start_tmp,
 384    (IV)rex->offs[paren].end,
 385    (paren > rex->lastparen ? "(skipped)" : ""));
 386   );
 387   paren--;
 388  }
 389 #if 1
 390  /* It would seem that the similar code in regtry()
 391  * already takes care of this, and in fact it is in
 392  * a better location to since this code can #if 0-ed out
 393  * but the code in regtry() is needed or otherwise tests
 394  * requiring null fields (pat.t#187 and split.t#{13,14}
 395  * (as of patchlevel 7877)  will fail.  Then again,
 396  * this code seems to be necessary or otherwise
 397  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 398  * --jhi updated by dapm */
 399  for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 400   if (i > *maxopenparen_p)
 401    rex->offs[i].start = -1;
 402   rex->offs[i].end = -1;
 403   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 404    "    \\%"UVuf": %s   ..-1 undeffing\n",
 405    (UV)i,
 406    (i > *maxopenparen_p) ? "-1" : "  "
 407   ));
 408  }
 409 #endif
 410 }
 411
 412 /* restore the parens and associated vars at savestack position ix,
 413  * but without popping the stack */
 414
 415 STATIC void
 416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 417 {
 418  I32 tmpix = PL_savestack_ix;
 419  PL_savestack_ix = ix;
 420  regcppop(rex, maxopenparen_p);
 421  PL_savestack_ix = tmpix;
 422 }
 423
 424 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
 425
 426 STATIC bool
 427 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 428 {
 429  /* Returns a boolean as to whether or not 'character' is a member of the
 430  * Posix character class given by 'classnum' that should be equivalent to a
 431  * value in the typedef '_char_class_number'.
 432  *
 433  * Ideally this could be replaced by a just an array of function pointers
 434  * to the C library functions that implement the macros this calls.
 435  * However, to compile, the precise function signatures are required, and
 436  * these may vary from platform to to platform.  To avoid having to figure
 437  * out what those all are on each platform, I (khw) am using this method,
 438  * which adds an extra layer of function call overhead (unless the C
 439  * optimizer strips it away).  But we don't particularly care about
 440  * performance with locales anyway. */
 441
 442  switch ((_char_class_number) classnum) {
 443   case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 444   case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 445   case _CC_ENUM_ASCII:     return isASCII_LC(character);
 446   case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 447   case _CC_ENUM_CASED:     return isLOWER_LC(character)
 448           || isUPPER_LC(character);
 449   case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 450   case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 451   case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 452   case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 453   case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 454   case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 455   case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 456   case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 457   case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 458   case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 459   case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 460   default:    /* VERTSPACE should never occur in locales */
 461    Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 462  }
 463
 464  assert(0); /* NOTREACHED */
 465  return FALSE;
 466 }
 467
 468 STATIC bool
 469 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 470 {
 471  /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 472  * 'character' is a member of the Posix character class given by 'classnum'
 473  * that should be equivalent to a value in the typedef
 474  * '_char_class_number'.
 475  *
 476  * This just calls isFOO_lc on the code point for the character if it is in
 477  * the range 0-255.  Outside that range, all characters avoid Unicode
 478  * rules, ignoring any locale.  So use the Unicode function if this class
 479  * requires a swash, and use the Unicode macro otherwise. */
 480
 481  PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 482
 483  if (UTF8_IS_INVARIANT(*character)) {
 484   return isFOO_lc(classnum, *character);
 485  }
 486  else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 487   return isFOO_lc(classnum,
 488       TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1)));
 489  }
 490
 491  if (classnum < _FIRST_NON_SWASH_CC) {
 492
 493   /* Initialize the swash unless done already */
 494   if (! PL_utf8_swash_ptrs[classnum]) {
 495    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 496    PL_utf8_swash_ptrs[classnum] = _core_swash_init("utf8",
 497     swash_property_names[classnum], &PL_sv_undef, 1, 0, NULL, &flags);
 498   }
 499
 500   return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 501         character,
 502         TRUE /* is UTF */ ));
 503  }
 504
 505  switch ((_char_class_number) classnum) {
 506   case _CC_ENUM_SPACE:
 507   case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 508
 509   case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 510   case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 511   case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 512   default:                 return 0;  /* Things like CNTRL are always
 513            below 256 */
 514  }
 515
 516  assert(0); /* NOTREACHED */
 517  return FALSE;
 518 }
 519
 520 /*
 521  * pregexec and friends
 522  */
 523
 524 #ifndef PERL_IN_XSUB_RE
 525 /*
 526  - pregexec - match a regexp against a string
 527  */
 528 I32
 529 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 530   char *strbeg, I32 minend, SV *screamer, U32 nosave)
 531 /* stringarg: the point in the string at which to begin matching */
 532 /* strend:    pointer to null at end of string */
 533 /* strbeg:    real beginning of string */
 534 /* minend:    end of match must be >= minend bytes after stringarg. */
 535 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 536  *            itself is accessed via the pointers above */
 537 /* nosave:    For optimizations. */
 538 {
 539  PERL_ARGS_ASSERT_PREGEXEC;
 540
 541  return
 542   regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 543      nosave ? 0 : REXEC_COPY_STR);
 544 }
 545 #endif
 546
 547 /*
 548  * Need to implement the following flags for reg_anch:
 549  *
 550  * USE_INTUIT_NOML  - Useful to call re_intuit_start() first
 551  * USE_INTUIT_ML
 552  * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
 553  * INTUIT_AUTORITATIVE_ML
 554  * INTUIT_ONCE_NOML  - Intuit can match in one location only.
 555  * INTUIT_ONCE_ML
 556  *
 557  * Another flag for this function: SECOND_TIME (so that float substrs
 558  * with giant delta may be not rechecked).
 559  */
 560
 561 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 562
 563 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 564    Otherwise, only SvCUR(sv) is used to get strbeg. */
 565
 566 /* XXXX We assume that strpos is strbeg unless sv. */
 567
 568 /* XXXX Some places assume that there is a fixed substring.
 569   An update may be needed if optimizer marks as "INTUITable"
 570   RExen without fixed substrings.  Similarly, it is assumed that
 571   lengths of all the strings are no more than minlen, thus they
 572   cannot come from lookahead.
 573   (Or minlen should take into account lookahead.)
 574   NOTE: Some of this comment is not correct. minlen does now take account
 575   of lookahead/behind. Further research is required. -- demerphq
 576
 577 */
 578
 579 /* A failure to find a constant substring means that there is no need to make
 580    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 581    finding a substring too deep into the string means that fewer calls to
 582    regtry() should be needed.
 583
 584    REx compiler's optimizer found 4 possible hints:
 585   a) Anchored substring;
 586   b) Fixed substring;
 587   c) Whether we are anchored (beginning-of-line or \G);
 588   d) First node (of those at offset 0) which may distinguish positions;
 589    We use a)b)d) and multiline-part of c), and try to find a position in the
 590    string which does not contradict any of them.
 591  */
 592
 593 /* Most of decisions we do here should have been done at compile time.
 594    The nodes of the REx which we used for the search should have been
 595    deleted from the finite automaton. */
 596
 597 /* args:
 598  * rx:     the regex to match against
 599  * sv:     the SV being matched: only used for utf8 flag; the string
 600  *         itself is accessed via the pointers below. Note that on
 601  *         something like an overloaded SV, SvPOK(sv) may be false
 602  *         and the string pointers may point to something unrelated to
 603  *         the SV itself.
 604  * strbeg: real beginning of string
 605  * strpos: the point in the string at which to begin matching
 606  * strend: pointer to the byte following the last char of the string
 607  * flags   currently unused; set to 0
 608  * data:   currently unused; set to NULL
 609  */
 610
 611 char *
 612 Perl_re_intuit_start(pTHX_
 613      REGEXP * const rx,
 614      SV *sv,
 615      const char * const strbeg,
 616      char *strpos,
 617      char *strend,
 618      const U32 flags,
 619      re_scream_pos_data *data)
 620 {
 621  dVAR;
 622  struct regexp *const prog = ReANY(rx);
 623  I32 start_shift = 0;
 624  /* Should be nonnegative! */
 625  I32 end_shift   = 0;
 626  char *s;
 627  SV *check;
 628  char *t;
 629  const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 630  I32 ml_anch;
 631  char *other_last = NULL; /* other substr checked before this */
 632  char *check_at = NULL;  /* check substr found at this pos */
 633  char *checked_upto = NULL;          /* how far into the string we have already checked using find_byclass*/
 634  const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 635  RXi_GET_DECL(prog,progi);
 636  regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 637  regmatch_info *const reginfo = &reginfo_buf;
 638 #ifdef DEBUGGING
 639  const char * const i_strpos = strpos;
 640 #endif
 641  GET_RE_DEBUG_FLAGS_DECL;
 642
 643  PERL_ARGS_ASSERT_RE_INTUIT_START;
 644  PERL_UNUSED_ARG(flags);
 645  PERL_UNUSED_ARG(data);
 646
 647  /* CHR_DIST() would be more correct here but it makes things slow. */
 648  if (prog->minlen > strend - strpos) {
 649   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 650        "String too short... [re_intuit_start]\n"));
 651   goto fail;
 652  }
 653
 654  reginfo->is_utf8_target = cBOOL(utf8_target);
 655  reginfo->info_aux = NULL;
 656  reginfo->strbeg = strbeg;
 657  reginfo->strend = strend;
 658  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 659  reginfo->intuit = 1;
 660  /* not actually used within intuit, but zero for safety anyway */
 661  reginfo->poscache_maxiter = 0;
 662
 663  if (utf8_target) {
 664   if (!prog->check_utf8 && prog->check_substr)
 665    to_utf8_substr(prog);
 666   check = prog->check_utf8;
 667  } else {
 668   if (!prog->check_substr && prog->check_utf8) {
 669    if (! to_byte_substr(prog)) {
 670     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 671    }
 672   }
 673   check = prog->check_substr;
 674  }
 675  if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
 676   ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 677      || ( (prog->extflags & RXf_ANCH_BOL)
 678       && !multiline ) ); /* Check after \n? */
 679
 680   if (!ml_anch) {
 681   if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 682     && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 683    && (strpos != strbeg)) {
 684    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 685    goto fail;
 686   }
 687   if (prog->check_offset_min == prog->check_offset_max
 688    && !(prog->extflags & RXf_CANY_SEEN)
 689    && ! multiline)   /* /m can cause \n's to match that aren't
 690         accounted for in the string max length.
 691         See [perl #115242] */
 692   {
 693    /* Substring at constant offset from beg-of-str... */
 694    I32 slen;
 695
 696    s = HOP3c(strpos, prog->check_offset_min, strend);
 697
 698    if (SvTAIL(check)) {
 699     slen = SvCUR(check); /* >= 1 */
 700
 701     if ( strend - s > slen || strend - s < slen - 1
 702      || (strend - s == slen && strend[-1] != '\n')) {
 703      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 704      goto fail_finish;
 705     }
 706     /* Now should match s[0..slen-2] */
 707     slen--;
 708     if (slen && (*SvPVX_const(check) != *s
 709        || (slen > 1
 710         && memNE(SvPVX_const(check), s, slen)))) {
 711     report_neq:
 712      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 713      goto fail_finish;
 714     }
 715    }
 716    else if (*SvPVX_const(check) != *s
 717      || ((slen = SvCUR(check)) > 1
 718       && memNE(SvPVX_const(check), s, slen)))
 719     goto report_neq;
 720    check_at = s;
 721    goto success_at_start;
 722   }
 723   }
 724   /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 725   s = strpos;
 726   start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 727   end_shift = prog->check_end_shift;
 728
 729   if (!ml_anch) {
 730    const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 731           - (SvTAIL(check) != 0);
 732    const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 733
 734    if (end_shift < eshift)
 735     end_shift = eshift;
 736   }
 737  }
 738  else {    /* Can match at random position */
 739   ml_anch = 0;
 740   s = strpos;
 741   start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 742   end_shift = prog->check_end_shift;
 743
 744   /* end shift should be non negative here */
 745  }
 746
 747 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
 748  if (end_shift < 0)
 749   Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 750     (IV)end_shift, RX_PRECOMP(prog));
 751 #endif
 752
 753   restart:
 754  /* Find a possible match in the region s..strend by looking for
 755  the "check" substring in the region corrected by start/end_shift. */
 756
 757  {
 758   I32 srch_start_shift = start_shift;
 759   I32 srch_end_shift = end_shift;
 760   U8* start_point;
 761   U8* end_point;
 762   if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 763    srch_end_shift -= ((strbeg - s) - srch_start_shift);
 764    srch_start_shift = strbeg - s;
 765   }
 766  DEBUG_OPTIMISE_MORE_r({
 767   PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 768    (IV)prog->check_offset_min,
 769    (IV)srch_start_shift,
 770    (IV)srch_end_shift,
 771    (IV)prog->check_end_shift);
 772  });
 773
 774   if (prog->extflags & RXf_CANY_SEEN) {
 775    start_point= (U8*)(s + srch_start_shift);
 776    end_point= (U8*)(strend - srch_end_shift);
 777   } else {
 778    start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 779    end_point= HOP3(strend, -srch_end_shift, strbeg);
 780   }
 781   DEBUG_OPTIMISE_MORE_r({
 782    PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 783     (int)(end_point - start_point),
 784     (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 785     start_point);
 786   });
 787
 788   s = fbm_instr( start_point, end_point,
 789      check, multiline ? FBMrf_MULTILINE : 0);
 790  }
 791  /* Update the count-of-usability, remove useless subpatterns,
 792   unshift s.  */
 793
 794  DEBUG_EXECUTE_r({
 795   RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 796    SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 797   PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 798       (s ? "Found" : "Did not find"),
 799    (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 800     ? "anchored" : "floating"),
 801    quoted,
 802    RE_SV_TAIL(check),
 803    (s ? " at offset " : "...\n") );
 804  });
 805
 806  if (!s)
 807   goto fail_finish;
 808  /* Finish the diagnostic message */
 809  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 810
 811  /* XXX dmq: first branch is for positive lookbehind...
 812  Our check string is offset from the beginning of the pattern.
 813  So we need to do any stclass tests offset forward from that
 814  point. I think. :-(
 815  */
 816
 817
 818
 819  check_at=s;
 820
 821
 822  /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 823  Start with the other substr.
 824  XXXX no SCREAM optimization yet - and a very coarse implementation
 825  XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 826     *always* match.  Probably should be marked during compile...
 827  Probably it is right to do no SCREAM here...
 828  */
 829
 830  if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
 831     : (prog->float_substr && prog->anchored_substr))
 832  {
 833   /* Take into account the "other" substring. */
 834   /* XXXX May be hopelessly wrong for UTF... */
 835   if (!other_last)
 836    other_last = strpos;
 837   if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
 838   do_other_anchored:
 839    {
 840     char * const last = HOP3c(s, -start_shift, strbeg);
 841     char *last1, *last2;
 842     char * const saved_s = s;
 843     SV* must;
 844
 845     t = s - prog->check_offset_max;
 846     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 847      && (!utf8_target
 848       || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 849        && t > strpos)))
 850      NOOP;
 851     else
 852      t = strpos;
 853     t = HOP3c(t, prog->anchored_offset, strend);
 854     if (t < other_last) /* These positions already checked */
 855      t = other_last;
 856     last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 857     if (last < last1)
 858      last1 = last;
 859     /* XXXX It is not documented what units *_offsets are in.
 860     We assume bytes, but this is clearly wrong.
 861     Meaning this code needs to be carefully reviewed for errors.
 862     dmq.
 863     */
 864
 865     /* On end-of-str: see comment below. */
 866     must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
 867     if (must == &PL_sv_undef) {
 868      s = (char*)NULL;
 869      DEBUG_r(must = prog->anchored_utf8); /* for debug */
 870     }
 871     else
 872      s = fbm_instr(
 873       (unsigned char*)t,
 874       HOP3(HOP3(last1, prog->anchored_offset, strend)
 875         + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 876       must,
 877       multiline ? FBMrf_MULTILINE : 0
 878      );
 879     DEBUG_EXECUTE_r({
 880      RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 881       SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 882      PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 883       (s ? "Found" : "Contradicts"),
 884       quoted, RE_SV_TAIL(must));
 885     });
 886
 887
 888     if (!s) {
 889      if (last1 >= last2) {
 890       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 891             ", giving up...\n"));
 892       goto fail_finish;
 893      }
 894      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 895       ", trying floating at offset %ld...\n",
 896       (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 897      other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 898      s = HOP3c(last, 1, strend);
 899      goto restart;
 900     }
 901     else {
 902      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 903       (long)(s - i_strpos)));
 904      t = HOP3c(s, -prog->anchored_offset, strbeg);
 905      other_last = HOP3c(s, 1, strend);
 906      s = saved_s;
 907      if (t == strpos)
 908       goto try_at_start;
 909      goto try_at_offset;
 910     }
 911    }
 912   }
 913   else {  /* Take into account the floating substring. */
 914    char *last, *last1;
 915    char * const saved_s = s;
 916    SV* must;
 917
 918    t = HOP3c(s, -start_shift, strbeg);
 919    last1 = last =
 920     HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 921    if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 922     last = HOP3c(t, prog->float_max_offset, strend);
 923    s = HOP3c(t, prog->float_min_offset, strend);
 924    if (s < other_last)
 925     s = other_last;
 926  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 927    must = utf8_target ? prog->float_utf8 : prog->float_substr;
 928    /* fbm_instr() takes into account exact value of end-of-str
 929    if the check is SvTAIL(ed).  Since false positives are OK,
 930    and end-of-str is not later than strend we are OK. */
 931    if (must == &PL_sv_undef) {
 932     s = (char*)NULL;
 933     DEBUG_r(must = prog->float_utf8); /* for debug message */
 934    }
 935    else
 936     s = fbm_instr((unsigned char*)s,
 937        (unsigned char*)last + SvCUR(must)
 938         - (SvTAIL(must)!=0),
 939        must, multiline ? FBMrf_MULTILINE : 0);
 940    DEBUG_EXECUTE_r({
 941     RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 942      SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 943     PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 944      (s ? "Found" : "Contradicts"),
 945      quoted, RE_SV_TAIL(must));
 946    });
 947    if (!s) {
 948     if (last1 == last) {
 949      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 950            ", giving up...\n"));
 951      goto fail_finish;
 952     }
 953     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 954      ", trying anchored starting at offset %ld...\n",
 955      (long)(saved_s + 1 - i_strpos)));
 956     other_last = last;
 957     s = HOP3c(t, 1, strend);
 958     goto restart;
 959    }
 960    else {
 961     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 962      (long)(s - i_strpos)));
 963     other_last = s; /* Fix this later. --Hugo */
 964     s = saved_s;
 965     if (t == strpos)
 966      goto try_at_start;
 967     goto try_at_offset;
 968    }
 969   }
 970  }
 971
 972
 973  t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 974
 975  DEBUG_OPTIMISE_MORE_r(
 976   PerlIO_printf(Perl_debug_log,
 977    "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 978    (IV)prog->check_offset_min,
 979    (IV)prog->check_offset_max,
 980    (IV)(s-strpos),
 981    (IV)(t-strpos),
 982    (IV)(t-s),
 983    (IV)(strend-strpos)
 984   )
 985  );
 986
 987  if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 988   && (!utf8_target
 989    || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 990     && t > strpos)))
 991  {
 992   /* Fixed substring is found far enough so that the match
 993   cannot start at strpos. */
 994  try_at_offset:
 995   if (ml_anch && t[-1] != '\n') {
 996    /* Eventually fbm_*() should handle this, but often
 997    anchored_offset is not 0, so this check will not be wasted. */
 998    /* XXXX In the code below we prefer to look for "^" even in
 999    presence of anchored substrings.  And we search even
1000    beyond the found float position.  These pessimizations
1001    are historical artefacts only.  */
1002   find_anchor:
1003    while (t < strend - prog->minlen) {
1004     if (*t == '\n') {
1005      if (t < check_at - prog->check_offset_min) {
1006       if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1007        /* Since we moved from the found position,
1008        we definitely contradict the found anchored
1009        substr.  Due to the above check we do not
1010        contradict "check" substr.
1011        Thus we can arrive here only if check substr
1012        is float.  Redo checking for "other"=="fixed".
1013        */
1014        strpos = t + 1;
1015        DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1016         PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1017        goto do_other_anchored;
1018       }
1019       /* We don't contradict the found floating substring. */
1020       /* XXXX Why not check for STCLASS? */
1021       s = t + 1;
1022       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1023        PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1024       goto set_useful;
1025      }
1026      /* Position contradicts check-string */
1027      /* XXXX probably better to look for check-string
1028      than for "\n", so one should lower the limit for t? */
1029      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1030       PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1031      other_last = strpos = s = t + 1;
1032      goto restart;
1033     }
1034     t++;
1035    }
1036    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1037       PL_colors[0], PL_colors[1]));
1038    goto fail_finish;
1039   }
1040   else {
1041    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1042       PL_colors[0], PL_colors[1]));
1043   }
1044   s = t;
1045  set_useful:
1046   ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1047  }
1048  else {
1049   /* The found string does not prohibit matching at strpos,
1050   - no optimization of calling REx engine can be performed,
1051   unless it was an MBOL and we are not after MBOL,
1052   or a future STCLASS check will fail this. */
1053  try_at_start:
1054   /* Even in this situation we may use MBOL flag if strpos is offset
1055   wrt the start of the string. */
1056   if (ml_anch && (strpos != strbeg) && strpos[-1] != '\n'
1057    /* May be due to an implicit anchor of m{.*foo}  */
1058    && !(prog->intflags & PREGf_IMPLICIT))
1059   {
1060    t = strpos;
1061    goto find_anchor;
1062   }
1063   DEBUG_EXECUTE_r( if (ml_anch)
1064    PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1065       (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1066   );
1067  success_at_start:
1068   if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1069    && (utf8_target ? (
1070     prog->check_utf8  /* Could be deleted already */
1071     && --BmUSEFUL(prog->check_utf8) < 0
1072     && (prog->check_utf8 == prog->float_utf8)
1073    ) : (
1074     prog->check_substr  /* Could be deleted already */
1075     && --BmUSEFUL(prog->check_substr) < 0
1076     && (prog->check_substr == prog->float_substr)
1077    )))
1078   {
1079    /* If flags & SOMETHING - do not do it many times on the same match */
1080    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1081    /* XXX Does the destruction order has to change with utf8_target? */
1082    SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1083    SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1084    prog->check_substr = prog->check_utf8 = NULL; /* disable */
1085    prog->float_substr = prog->float_utf8 = NULL; /* clear */
1086    check = NULL;   /* abort */
1087    s = strpos;
1088    /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1089      see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1090    if (prog->intflags & PREGf_IMPLICIT)
1091     prog->extflags &= ~RXf_ANCH_MBOL;
1092    /* XXXX This is a remnant of the old implementation.  It
1093      looks wasteful, since now INTUIT can use many
1094      other heuristics. */
1095    prog->extflags &= ~RXf_USE_INTUIT;
1096    /* XXXX What other flags might need to be cleared in this branch? */
1097   }
1098   else
1099    s = strpos;
1100  }
1101
1102  /* Last resort... */
1103  /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1104  /* trie stclasses are too expensive to use here, we are better off to
1105  leave it to regmatch itself */
1106  if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1107   /* minlen == 0 is possible if regstclass is \b or \B,
1108   and the fixed substr is ''$.
1109   Since minlen is already taken into account, s+1 is before strend;
1110   accidentally, minlen >= 1 guaranties no false positives at s + 1
1111   even for \b or \B.  But (minlen? 1 : 0) below assumes that
1112   regstclass does not come from lookahead...  */
1113   /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1114   This leaves EXACTF-ish only, which are dealt with in find_byclass().  */
1115   const U8* const str = (U8*)STRING(progi->regstclass);
1116   const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1117      ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1118      : 1);
1119   char * endpos;
1120   if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1121    endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1122   else if (prog->float_substr || prog->float_utf8)
1123    endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1124   else
1125    endpos= strend;
1126
1127   if (checked_upto < s)
1128   checked_upto = s;
1129   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1130          (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1131
1132   t = s;
1133   s = find_byclass(prog, progi->regstclass, checked_upto, endpos,
1134        reginfo);
1135   if (s) {
1136    checked_upto = s;
1137   } else {
1138 #ifdef DEBUGGING
1139    const char *what = NULL;
1140 #endif
1141    if (endpos == strend) {
1142     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1143         "Could not match STCLASS...\n") );
1144     goto fail;
1145    }
1146    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1147         "This position contradicts STCLASS...\n") );
1148    if ((prog->extflags & RXf_ANCH) && !ml_anch)
1149     goto fail;
1150    checked_upto = HOPBACKc(endpos, start_shift);
1151    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1152          (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1153    /* Contradict one of substrings */
1154    if (prog->anchored_substr || prog->anchored_utf8) {
1155     if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1156      DEBUG_EXECUTE_r( what = "anchored" );
1157     hop_and_restart:
1158      s = HOP3c(t, 1, strend);
1159      if (s + start_shift + end_shift > strend) {
1160       /* XXXX Should be taken into account earlier? */
1161       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1162            "Could not match STCLASS...\n") );
1163       goto fail;
1164      }
1165      if (!check)
1166       goto giveup;
1167      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1168         "Looking for %s substr starting at offset %ld...\n",
1169         what, (long)(s + start_shift - i_strpos)) );
1170      goto restart;
1171     }
1172     /* Have both, check_string is floating */
1173     if (t + start_shift >= check_at) /* Contradicts floating=check */
1174      goto retry_floating_check;
1175     /* Recheck anchored substring, but not floating... */
1176     s = check_at;
1177     if (!check)
1178      goto giveup;
1179     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1180       "Looking for anchored substr starting at offset %ld...\n",
1181       (long)(other_last - i_strpos)) );
1182     goto do_other_anchored;
1183    }
1184    /* Another way we could have checked stclass at the
1185    current position only: */
1186    if (ml_anch) {
1187     s = t = t + 1;
1188     if (!check)
1189      goto giveup;
1190     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1191       "Looking for /%s^%s/m starting at offset %ld...\n",
1192       PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1193     goto try_at_offset;
1194    }
1195    if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1196     goto fail;
1197    /* Check is floating substring. */
1198   retry_floating_check:
1199    t = check_at - start_shift;
1200    DEBUG_EXECUTE_r( what = "floating" );
1201    goto hop_and_restart;
1202   }
1203   if (t != s) {
1204    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1205       "By STCLASS: moving %ld --> %ld\n",
1206         (long)(t - i_strpos), (long)(s - i_strpos))
1207     );
1208   }
1209   else {
1210    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1211         "Does not contradict STCLASS...\n");
1212     );
1213   }
1214  }
1215   giveup:
1216  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1217       PL_colors[4], (check ? "Guessed" : "Giving up"),
1218       PL_colors[5], (long)(s - i_strpos)) );
1219  return s;
1220
1221   fail_finish:    /* Substring not found */
1222  if (prog->check_substr || prog->check_utf8)  /* could be removed already */
1223   BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1224   fail:
1225  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1226       PL_colors[4], PL_colors[5]));
1227  return NULL;
1228 }
1229
1230 #define DECL_TRIE_TYPE(scan) \
1231  const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1232      trie_type = ((scan->flags == EXACT) \
1233        ? (utf8_target ? trie_utf8 : trie_plain) \
1234        : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1235
1236 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1237 STMT_START {                               \
1238  STRLEN skiplen;                                                                 \
1239  switch (trie_type) {                                                            \
1240  case trie_utf8_fold:                                                            \
1241   if ( foldlen>0 ) {                                                          \
1242    uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1243    foldlen -= len;                                                         \
1244    uscan += len;                                                           \
1245    len=0;                                                                  \
1246   } else {                                                                    \
1247    uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );                \
1248    len = UTF8SKIP(uc);                                                     \
1249    skiplen = UNISKIP( uvc );                                               \
1250    foldlen -= skiplen;                                                     \
1251    uscan = foldbuf + skiplen;                                              \
1252   }                                                                           \
1253   break;                                                                      \
1254  case trie_latin_utf8_fold:                                                      \
1255   if ( foldlen>0 ) {                                                          \
1256    uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1257    foldlen -= len;                                                         \
1258    uscan += len;                                                           \
1259    len=0;                                                                  \
1260   } else {                                                                    \
1261    len = 1;                                                                \
1262    uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL);   \
1263    skiplen = UNISKIP( uvc );                                               \
1264    foldlen -= skiplen;                                                     \
1265    uscan = foldbuf + skiplen;                                              \
1266   }                                                                           \
1267   break;                                                                      \
1268  case trie_utf8:                                                                 \
1269   uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1270   break;                                                                      \
1271  case trie_plain:                                                                \
1272   uvc = (UV)*uc;                                                              \
1273   len = 1;                                                                    \
1274  }                                                                               \
1275  if (uvc < 256) {                                                                \
1276   charid = trie->charmap[ uvc ];                                              \
1277  }                                                                               \
1278  else {                                                                          \
1279   charid = 0;                                                                 \
1280   if (widecharmap) {                                                          \
1281    SV** const svpp = hv_fetch(widecharmap,                                 \
1282       (char*)&uvc, sizeof(UV), 0);                                \
1283    if (svpp)                                                               \
1284     charid = (U16)SvIV(*svpp);                                          \
1285   }                                                                           \
1286  }                                                                               \
1287 } STMT_END
1288
1289 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1290 STMT_START {                                              \
1291  while (s <= e) {                                      \
1292   if ( (CoNd)                                       \
1293    && (ln == 1 || folder(s, pat_string, ln))    \
1294    && (reginfo->intuit || regtry(reginfo, &s)) )\
1295    goto got_it;                                  \
1296   s++;                                              \
1297  }                                                     \
1298 } STMT_END
1299
1300 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1301 STMT_START {                                          \
1302  while (s < strend) {                              \
1303   CoDe                                          \
1304   s += UTF8SKIP(s);                             \
1305  }                                                 \
1306 } STMT_END
1307
1308 #define REXEC_FBC_SCAN(CoDe)                          \
1309 STMT_START {                                          \
1310  while (s < strend) {                              \
1311   CoDe                                          \
1312   s++;                                          \
1313  }                                                 \
1314 } STMT_END
1315
1316 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1317 REXEC_FBC_UTF8_SCAN(                                  \
1318  if (CoNd) {                                       \
1319   if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1320    goto got_it;                              \
1321   else                                          \
1322    tmp = doevery;                            \
1323  }                                                 \
1324  else                                              \
1325   tmp = 1;                                      \
1326 )
1327
1328 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1329 REXEC_FBC_SCAN(                                       \
1330  if (CoNd) {                                       \
1331   if (tmp && (reginfo->intuit || regtry(reginfo, &s)))  \
1332    goto got_it;                              \
1333   else                                          \
1334    tmp = doevery;                            \
1335  }                                                 \
1336  else                                              \
1337   tmp = 1;                                      \
1338 )
1339
1340 #define REXEC_FBC_TRYIT               \
1341 if ((reginfo->intuit || regtry(reginfo, &s))) \
1342  goto got_it
1343
1344 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1345  if (utf8_target) {                                             \
1346   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1347  }                                                          \
1348  else {                                                     \
1349   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1350  }
1351
1352 #define DUMP_EXEC_POS(li,s,doutf8) \
1353  dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1354     startpos, doutf8)
1355
1356
1357 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1358   tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1359   tmp = TEST_NON_UTF8(tmp);                                              \
1360   REXEC_FBC_UTF8_SCAN(                                                   \
1361    if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1362     tmp = !tmp;                                                    \
1363     IF_SUCCESS;                                                    \
1364    }                                                                  \
1365    else {                                                             \
1366     IF_FAIL;                                                       \
1367    }                                                                  \
1368   );                                                                     \
1369
1370 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1371   if (s == reginfo->strbeg) {                                            \
1372    tmp = '\n';                                                        \
1373   }                                                                      \
1374   else {                                                                 \
1375    U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);          \
1376    tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);       \
1377   }                                                                      \
1378   tmp = TeSt1_UtF8;                                                      \
1379   LOAD_UTF8_CHARCLASS_ALNUM();                                                                \
1380   REXEC_FBC_UTF8_SCAN(                                                   \
1381    if (tmp == ! (TeSt2_UtF8)) { \
1382     tmp = !tmp;                                                    \
1383     IF_SUCCESS;                                                    \
1384    }                                                                  \
1385    else {                                                             \
1386     IF_FAIL;                                                       \
1387    }                                                                  \
1388   );                                                                     \
1389
1390 /* The only difference between the BOUND and NBOUND cases is that
1391  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1392  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1393  * with the other one being empty */
1394 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1395  FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1396
1397 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1398  FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1399
1400 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1401  FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1402
1403 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1404  FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1405
1406
1407 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1408  * be passed in completely with the variable name being tested, which isn't
1409  * such a clean interface, but this is easier to read than it was before.  We
1410  * are looking for the boundary (or non-boundary between a word and non-word
1411  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1412  * must be different.  Find the "wordness" of the character just prior to this
1413  * one, and compare it with the wordness of this one.  If they differ, we have
1414  * a boundary.  At the beginning of the string, pretend that the previous
1415  * character was a new-line */
1416 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1417  if (utf8_target) {                                                         \
1418     UTF8_CODE \
1419  }                                                                          \
1420  else {  /* Not utf8 */                                                     \
1421   tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1422   tmp = TEST_NON_UTF8(tmp);                                              \
1423   REXEC_FBC_SCAN(                                                        \
1424    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1425     tmp = !tmp;                                                    \
1426     IF_SUCCESS;                                                    \
1427    }                                                                  \
1428    else {                                                             \
1429     IF_FAIL;                                                       \
1430    }                                                                  \
1431   );                                                                     \
1432  }                                                                          \
1433  if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s)))           \
1434   goto got_it;
1435
1436 /* We know what class REx starts with.  Try to find this position... */
1437 /* if reginfo->intuit, its a dryrun */
1438 /* annoyingly all the vars in this routine have different names from their counterparts
1439    in regmatch. /grrr */
1440
1441 STATIC char *
1442 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1443  const char *strend, regmatch_info *reginfo)
1444 {
1445  dVAR;
1446  const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1447  char *pat_string;   /* The pattern's exactish string */
1448  char *pat_end;     /* ptr to end char of pat_string */
1449  re_fold_t folder; /* Function for computing non-utf8 folds */
1450  const U8 *fold_array;   /* array for folding ords < 256 */
1451  STRLEN ln;
1452  STRLEN lnc;
1453  U8 c1;
1454  U8 c2;
1455  char *e;
1456  I32 tmp = 1; /* Scratch variable? */
1457  const bool utf8_target = reginfo->is_utf8_target;
1458  UV utf8_fold_flags = 0;
1459  const bool is_utf8_pat = reginfo->is_utf8_pat;
1460  bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1461         with a result inverts that result, as 0^1 =
1462         1 and 1^1 = 0 */
1463  _char_class_number classnum;
1464
1465  RXi_GET_DECL(prog,progi);
1466
1467  PERL_ARGS_ASSERT_FIND_BYCLASS;
1468
1469  /* We know what class it must start with. */
1470  switch (OP(c)) {
1471  case ANYOF:
1472  case ANYOF_SYNTHETIC:
1473  case ANYOF_WARN_SUPER:
1474   if (utf8_target) {
1475    REXEC_FBC_UTF8_CLASS_SCAN(
1476      reginclass(prog, c, (U8*)s, utf8_target));
1477   }
1478   else {
1479    REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1480   }
1481   break;
1482  case CANY:
1483   REXEC_FBC_SCAN(
1484    if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1485     goto got_it;
1486    else
1487     tmp = doevery;
1488   );
1489   break;
1490
1491  case EXACTFA:
1492   if (is_utf8_pat || utf8_target) {
1493    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1494    goto do_exactf_utf8;
1495   }
1496   fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1497   folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1498   goto do_exactf_non_utf8; /* isn't dealt with by these */
1499
1500  case EXACTF:
1501   if (utf8_target) {
1502
1503    /* regcomp.c already folded this if pattern is in UTF-8 */
1504    utf8_fold_flags = 0;
1505    goto do_exactf_utf8;
1506   }
1507   fold_array = PL_fold;
1508   folder = foldEQ;
1509   goto do_exactf_non_utf8;
1510
1511  case EXACTFL:
1512   if (is_utf8_pat || utf8_target) {
1513    utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1514    goto do_exactf_utf8;
1515   }
1516   fold_array = PL_fold_locale;
1517   folder = foldEQ_locale;
1518   goto do_exactf_non_utf8;
1519
1520  case EXACTFU_SS:
1521   if (is_utf8_pat) {
1522    utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1523   }
1524   goto do_exactf_utf8;
1525
1526  case EXACTFU_TRICKYFOLD:
1527  case EXACTFU:
1528   if (is_utf8_pat || utf8_target) {
1529    utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1530    goto do_exactf_utf8;
1531   }
1532
1533   /* Any 'ss' in the pattern should have been replaced by regcomp,
1534   * so we don't have to worry here about this single special case
1535   * in the Latin1 range */
1536   fold_array = PL_fold_latin1;
1537   folder = foldEQ_latin1;
1538
1539   /* FALL THROUGH */
1540
1541  do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1542       are no glitches with fold-length differences
1543       between the target string and pattern */
1544
1545   /* The idea in the non-utf8 EXACTF* cases is to first find the
1546   * first character of the EXACTF* node and then, if necessary,
1547   * case-insensitively compare the full text of the node.  c1 is the
1548   * first character.  c2 is its fold.  This logic will not work for
1549   * Unicode semantics and the german sharp ss, which hence should
1550   * not be compiled into a node that gets here. */
1551   pat_string = STRING(c);
1552   ln  = STR_LEN(c); /* length to match in octets/bytes */
1553
1554   /* We know that we have to match at least 'ln' bytes (which is the
1555   * same as characters, since not utf8).  If we have to match 3
1556   * characters, and there are only 2 availabe, we know without
1557   * trying that it will fail; so don't start a match past the
1558   * required minimum number from the far end */
1559   e = HOP3c(strend, -((I32)ln), s);
1560
1561   if (reginfo->intuit && e < s) {
1562    e = s;   /* Due to minlen logic of intuit() */
1563   }
1564
1565   c1 = *pat_string;
1566   c2 = fold_array[c1];
1567   if (c1 == c2) { /* If char and fold are the same */
1568    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1569   }
1570   else {
1571    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1572   }
1573   break;
1574
1575  do_exactf_utf8:
1576  {
1577   unsigned expansion;
1578
1579   /* If one of the operands is in utf8, we can't use the simpler folding
1580   * above, due to the fact that many different characters can have the
1581   * same fold, or portion of a fold, or different- length fold */
1582   pat_string = STRING(c);
1583   ln  = STR_LEN(c); /* length to match in octets/bytes */
1584   pat_end = pat_string + ln;
1585   lnc = is_utf8_pat       /* length to match in characters */
1586     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1587     : ln;
1588
1589   /* We have 'lnc' characters to match in the pattern, but because of
1590   * multi-character folding, each character in the target can match
1591   * up to 3 characters (Unicode guarantees it will never exceed
1592   * this) if it is utf8-encoded; and up to 2 if not (based on the
1593   * fact that the Latin 1 folds are already determined, and the
1594   * only multi-char fold in that range is the sharp-s folding to
1595   * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1596   * string character.  Adjust lnc accordingly, rounding up, so that
1597   * if we need to match at least 4+1/3 chars, that really is 5. */
1598   expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1599   lnc = (lnc + expansion - 1) / expansion;
1600
1601   /* As in the non-UTF8 case, if we have to match 3 characters, and
1602   * only 2 are left, it's guaranteed to fail, so don't start a
1603   * match that would require us to go beyond the end of the string
1604   */
1605   e = HOP3c(strend, -((I32)lnc), s);
1606
1607   if (reginfo->intuit && e < s) {
1608    e = s;   /* Due to minlen logic of intuit() */
1609   }
1610
1611   /* XXX Note that we could recalculate e to stop the loop earlier,
1612   * as the worst case expansion above will rarely be met, and as we
1613   * go along we would usually find that e moves further to the left.
1614   * This would happen only after we reached the point in the loop
1615   * where if there were no expansion we should fail.  Unclear if
1616   * worth the expense */
1617
1618   while (s <= e) {
1619    char *my_strend= (char *)strend;
1620    if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1621     pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1622     && (reginfo->intuit || regtry(reginfo, &s)) )
1623    {
1624     goto got_it;
1625    }
1626    s += (utf8_target) ? UTF8SKIP(s) : 1;
1627   }
1628   break;
1629  }
1630  case BOUNDL:
1631   RXp_MATCH_TAINTED_on(prog);
1632   FBC_BOUND(isWORDCHAR_LC,
1633     isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1634     isWORDCHAR_LC_utf8((U8*)s));
1635   break;
1636  case NBOUNDL:
1637   RXp_MATCH_TAINTED_on(prog);
1638   FBC_NBOUND(isWORDCHAR_LC,
1639     isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
1640     isWORDCHAR_LC_utf8((U8*)s));
1641   break;
1642  case BOUND:
1643   FBC_BOUND(isWORDCHAR,
1644     isWORDCHAR_uni(tmp),
1645     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1646   break;
1647  case BOUNDA:
1648   FBC_BOUND_NOLOAD(isWORDCHAR_A,
1649       isWORDCHAR_A(tmp),
1650       isWORDCHAR_A((U8*)s));
1651   break;
1652  case NBOUND:
1653   FBC_NBOUND(isWORDCHAR,
1654     isWORDCHAR_uni(tmp),
1655     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1656   break;
1657  case NBOUNDA:
1658   FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1659       isWORDCHAR_A(tmp),
1660       isWORDCHAR_A((U8*)s));
1661   break;
1662  case BOUNDU:
1663   FBC_BOUND(isWORDCHAR_L1,
1664     isWORDCHAR_uni(tmp),
1665     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1666   break;
1667  case NBOUNDU:
1668   FBC_NBOUND(isWORDCHAR_L1,
1669     isWORDCHAR_uni(tmp),
1670     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1671   break;
1672  case LNBREAK:
1673   REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1674       is_LNBREAK_latin1_safe(s, strend)
1675   );
1676   break;
1677
1678  /* The argument to all the POSIX node types is the class number to pass to
1679  * _generic_isCC() to build a mask for searching in PL_charclass[] */
1680
1681  case NPOSIXL:
1682   to_complement = 1;
1683   /* FALLTHROUGH */
1684
1685  case POSIXL:
1686   RXp_MATCH_TAINTED_on(prog);
1687   REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1688       to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1689   break;
1690
1691  case NPOSIXD:
1692   to_complement = 1;
1693   /* FALLTHROUGH */
1694
1695  case POSIXD:
1696   if (utf8_target) {
1697    goto posix_utf8;
1698   }
1699   goto posixa;
1700
1701  case NPOSIXA:
1702   if (utf8_target) {
1703    /* The complement of something that matches only ASCII matches all
1704    * UTF-8 variant code points, plus everything in ASCII that isn't
1705    * in the class */
1706    REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1707          || ! _generic_isCC_A(*s, FLAGS(c)));
1708    break;
1709   }
1710
1711   to_complement = 1;
1712   /* FALLTHROUGH */
1713
1714  case POSIXA:
1715  posixa:
1716   /* Don't need to worry about utf8, as it can match only a single
1717   * byte invariant character. */
1718   REXEC_FBC_CLASS_SCAN(
1719       to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1720   break;
1721
1722  case NPOSIXU:
1723   to_complement = 1;
1724   /* FALLTHROUGH */
1725
1726  case POSIXU:
1727   if (! utf8_target) {
1728    REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1729                  FLAGS(c))));
1730   }
1731   else {
1732
1733  posix_utf8:
1734    classnum = (_char_class_number) FLAGS(c);
1735    if (classnum < _FIRST_NON_SWASH_CC) {
1736     while (s < strend) {
1737
1738      /* We avoid loading in the swash as long as possible, but
1739      * should we have to, we jump to a separate loop.  This
1740      * extra 'if' statement is what keeps this code from being
1741      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1742      if (UTF8_IS_ABOVE_LATIN1(*s)) {
1743       goto found_above_latin1;
1744      }
1745      if ((UTF8_IS_INVARIANT(*s)
1746       && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1747                 classnum)))
1748       || (UTF8_IS_DOWNGRADEABLE_START(*s)
1749        && to_complement ^ cBOOL(
1750         _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)),
1751            classnum))))
1752      {
1753       if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1754        goto got_it;
1755       else {
1756        tmp = doevery;
1757       }
1758      }
1759      else {
1760       tmp = 1;
1761      }
1762      s += UTF8SKIP(s);
1763     }
1764    }
1765    else switch (classnum) {    /* These classes are implemented as
1766           macros */
1767     case _CC_ENUM_SPACE: /* XXX would require separate code if we
1768           revert the change of \v matching this */
1769      /* FALL THROUGH */
1770
1771     case _CC_ENUM_PSXSPC:
1772      REXEC_FBC_UTF8_CLASS_SCAN(
1773           to_complement ^ cBOOL(isSPACE_utf8(s)));
1774      break;
1775
1776     case _CC_ENUM_BLANK:
1777      REXEC_FBC_UTF8_CLASS_SCAN(
1778           to_complement ^ cBOOL(isBLANK_utf8(s)));
1779      break;
1780
1781     case _CC_ENUM_XDIGIT:
1782      REXEC_FBC_UTF8_CLASS_SCAN(
1783          to_complement ^ cBOOL(isXDIGIT_utf8(s)));
1784      break;
1785
1786     case _CC_ENUM_VERTSPACE:
1787      REXEC_FBC_UTF8_CLASS_SCAN(
1788          to_complement ^ cBOOL(isVERTWS_utf8(s)));
1789      break;
1790
1791     case _CC_ENUM_CNTRL:
1792      REXEC_FBC_UTF8_CLASS_SCAN(
1793           to_complement ^ cBOOL(isCNTRL_utf8(s)));
1794      break;
1795
1796     default:
1797      Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
1798      assert(0); /* NOTREACHED */
1799    }
1800   }
1801   break;
1802
1803  found_above_latin1:   /* Here we have to load a swash to get the result
1804        for the current code point */
1805   if (! PL_utf8_swash_ptrs[classnum]) {
1806    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
1807    PL_utf8_swash_ptrs[classnum] =
1808      _core_swash_init("utf8", swash_property_names[classnum],
1809          &PL_sv_undef, 1, 0, NULL, &flags);
1810   }
1811
1812   /* This is a copy of the loop above for swash classes, though using the
1813   * FBC macro instead of being expanded out.  Since we've loaded the
1814   * swash, we don't have to check for that each time through the loop */
1815   REXEC_FBC_UTF8_CLASS_SCAN(
1816     to_complement ^ cBOOL(_generic_utf8(
1817          classnum,
1818          s,
1819          swash_fetch(PL_utf8_swash_ptrs[classnum],
1820             (U8 *) s, TRUE))));
1821   break;
1822
1823  case AHOCORASICKC:
1824  case AHOCORASICK:
1825   {
1826    DECL_TRIE_TYPE(c);
1827    /* what trie are we using right now */
1828    reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1829    reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1830    HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1831
1832    const char *last_start = strend - trie->minlen;
1833 #ifdef DEBUGGING
1834    const char *real_start = s;
1835 #endif
1836    STRLEN maxlen = trie->maxlen;
1837    SV *sv_points;
1838    U8 **points; /* map of where we were in the input string
1839        when reading a given char. For ASCII this
1840        is unnecessary overhead as the relationship
1841        is always 1:1, but for Unicode, especially
1842        case folded Unicode this is not true. */
1843    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1844    U8 *bitmap=NULL;
1845
1846
1847    GET_RE_DEBUG_FLAGS_DECL;
1848
1849    /* We can't just allocate points here. We need to wrap it in
1850    * an SV so it gets freed properly if there is a croak while
1851    * running the match */
1852    ENTER;
1853    SAVETMPS;
1854    sv_points=newSV(maxlen * sizeof(U8 *));
1855    SvCUR_set(sv_points,
1856     maxlen * sizeof(U8 *));
1857    SvPOK_on(sv_points);
1858    sv_2mortal(sv_points);
1859    points=(U8**)SvPV_nolen(sv_points );
1860    if ( trie_type != trie_utf8_fold
1861     && (trie->bitmap || OP(c)==AHOCORASICKC) )
1862    {
1863     if (trie->bitmap)
1864      bitmap=(U8*)trie->bitmap;
1865     else
1866      bitmap=(U8*)ANYOF_BITMAP(c);
1867    }
1868    /* this is the Aho-Corasick algorithm modified a touch
1869    to include special handling for long "unknown char" sequences.
1870    The basic idea being that we use AC as long as we are dealing
1871    with a possible matching char, when we encounter an unknown char
1872    (and we have not encountered an accepting state) we scan forward
1873    until we find a legal starting char.
1874    AC matching is basically that of trie matching, except that when
1875    we encounter a failing transition, we fall back to the current
1876    states "fail state", and try the current char again, a process
1877    we repeat until we reach the root state, state 1, or a legal
1878    transition. If we fail on the root state then we can either
1879    terminate if we have reached an accepting state previously, or
1880    restart the entire process from the beginning if we have not.
1881
1882    */
1883    while (s <= last_start) {
1884     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1885     U8 *uc = (U8*)s;
1886     U16 charid = 0;
1887     U32 base = 1;
1888     U32 state = 1;
1889     UV uvc = 0;
1890     STRLEN len = 0;
1891     STRLEN foldlen = 0;
1892     U8 *uscan = (U8*)NULL;
1893     U8 *leftmost = NULL;
1894 #ifdef DEBUGGING
1895     U32 accepted_word= 0;
1896 #endif
1897     U32 pointpos = 0;
1898
1899     while ( state && uc <= (U8*)strend ) {
1900      int failed=0;
1901      U32 word = aho->states[ state ].wordnum;
1902
1903      if( state==1 ) {
1904       if ( bitmap ) {
1905        DEBUG_TRIE_EXECUTE_r(
1906         if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1907          dump_exec_pos( (char *)uc, c, strend, real_start,
1908           (char *)uc, utf8_target );
1909          PerlIO_printf( Perl_debug_log,
1910           " Scanning for legal start char...\n");
1911         }
1912        );
1913        if (utf8_target) {
1914         while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1915          uc += UTF8SKIP(uc);
1916         }
1917        } else {
1918         while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1919          uc++;
1920         }
1921        }
1922        s= (char *)uc;
1923       }
1924       if (uc >(U8*)last_start) break;
1925      }
1926
1927      if ( word ) {
1928       U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1929       if (!leftmost || lpos < leftmost) {
1930        DEBUG_r(accepted_word=word);
1931        leftmost= lpos;
1932       }
1933       if (base==0) break;
1934
1935      }
1936      points[pointpos++ % maxlen]= uc;
1937      if (foldlen || uc < (U8*)strend) {
1938       REXEC_TRIE_READ_CHAR(trie_type, trie,
1939           widecharmap, uc,
1940           uscan, len, uvc, charid, foldlen,
1941           foldbuf, uniflags);
1942       DEBUG_TRIE_EXECUTE_r({
1943        dump_exec_pos( (char *)uc, c, strend,
1944           real_start, s, utf8_target);
1945        PerlIO_printf(Perl_debug_log,
1946         " Charid:%3u CP:%4"UVxf" ",
1947         charid, uvc);
1948       });
1949      }
1950      else {
1951       len = 0;
1952       charid = 0;
1953      }
1954
1955
1956      do {
1957 #ifdef DEBUGGING
1958       word = aho->states[ state ].wordnum;
1959 #endif
1960       base = aho->states[ state ].trans.base;
1961
1962       DEBUG_TRIE_EXECUTE_r({
1963        if (failed)
1964         dump_exec_pos( (char *)uc, c, strend, real_start,
1965          s,   utf8_target );
1966        PerlIO_printf( Perl_debug_log,
1967         "%sState: %4"UVxf", word=%"UVxf,
1968         failed ? " Fail transition to " : "",
1969         (UV)state, (UV)word);
1970       });
1971       if ( base ) {
1972        U32 tmp;
1973        I32 offset;
1974        if (charid &&
1975         ( ((offset = base + charid
1976          - 1 - trie->uniquecharcount)) >= 0)
1977         && ((U32)offset < trie->lasttrans)
1978         && trie->trans[offset].check == state
1979         && (tmp=trie->trans[offset].next))
1980        {
1981         DEBUG_TRIE_EXECUTE_r(
1982          PerlIO_printf( Perl_debug_log," - legal\n"));
1983         state = tmp;
1984         break;
1985        }
1986        else {
1987         DEBUG_TRIE_EXECUTE_r(
1988          PerlIO_printf( Perl_debug_log," - fail\n"));
1989         failed = 1;
1990         state = aho->fail[state];
1991        }
1992       }
1993       else {
1994        /* we must be accepting here */
1995        DEBUG_TRIE_EXECUTE_r(
1996          PerlIO_printf( Perl_debug_log," - accepting\n"));
1997        failed = 1;
1998        break;
1999       }
2000      } while(state);
2001      uc += len;
2002      if (failed) {
2003       if (leftmost)
2004        break;
2005       if (!state) state = 1;
2006      }
2007     }
2008     if ( aho->states[ state ].wordnum ) {
2009      U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2010      if (!leftmost || lpos < leftmost) {
2011       DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2012       leftmost = lpos;
2013      }
2014     }
2015     if (leftmost) {
2016      s = (char*)leftmost;
2017      DEBUG_TRIE_EXECUTE_r({
2018       PerlIO_printf(
2019        Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2020        (UV)accepted_word, (IV)(s - real_start)
2021       );
2022      });
2023      if (reginfo->intuit || regtry(reginfo, &s)) {
2024       FREETMPS;
2025       LEAVE;
2026       goto got_it;
2027      }
2028      s = HOPc(s,1);
2029      DEBUG_TRIE_EXECUTE_r({
2030       PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2031      });
2032     } else {
2033      DEBUG_TRIE_EXECUTE_r(
2034       PerlIO_printf( Perl_debug_log,"No match.\n"));
2035      break;
2036     }
2037    }
2038    FREETMPS;
2039    LEAVE;
2040   }
2041   break;
2042  default:
2043   Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2044   break;
2045  }
2046  return 0;
2047   got_it:
2048  return s;
2049 }
2050
2051
2052 /*
2053  - regexec_flags - match a regexp against a string
2054  */
2055 I32
2056 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2057    char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2058 /* stringarg: the point in the string at which to begin matching */
2059 /* strend:    pointer to null at end of string */
2060 /* strbeg:    real beginning of string */
2061 /* minend:    end of match must be >= minend bytes after stringarg. */
2062 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2063  *            itself is accessed via the pointers above */
2064 /* data:      May be used for some additional optimizations.
2065    Currently its only used, with a U32 cast, for transmitting
2066    the ganch offset when doing a /g match. This will change */
2067 /* nosave:    For optimizations. */
2068
2069 {
2070  dVAR;
2071  struct regexp *const prog = ReANY(rx);
2072  char *s;
2073  regnode *c;
2074  char *startpos = stringarg;
2075  I32 minlen;  /* must match at least this many chars */
2076  I32 dontbother = 0; /* how many characters not to try at end */
2077  I32 end_shift = 0;   /* Same for the end. */  /* CC */
2078  I32 scream_pos = -1;  /* Internal iterator of scream. */
2079  char *scream_olds = NULL;
2080  const bool utf8_target = cBOOL(DO_UTF8(sv));
2081  I32 multiline;
2082  RXi_GET_DECL(prog,progi);
2083  regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2084  regmatch_info *const reginfo = &reginfo_buf;
2085  regexp_paren_pair *swap = NULL;
2086  I32 oldsave;
2087  GET_RE_DEBUG_FLAGS_DECL;
2088
2089  PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2090  PERL_UNUSED_ARG(data);
2091
2092  /* Be paranoid... */
2093  if (prog == NULL || startpos == NULL) {
2094   Perl_croak(aTHX_ "NULL regexp parameter");
2095   return 0;
2096  }
2097
2098  DEBUG_EXECUTE_r(
2099   debug_start_match(rx, utf8_target, startpos, strend,
2100   "Matching");
2101  );
2102
2103
2104  /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2105  * which will call destuctors to reset PL_regmatch_state, free higher
2106  * PL_regmatch_slabs, and clean up regmatch_info_aux and
2107  * regmatch_info_aux_eval */
2108
2109  oldsave = PL_savestack_ix;
2110
2111  multiline = prog->extflags & RXf_PMf_MULTILINE;
2112  minlen = prog->minlen;
2113
2114  if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2115   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2116        "String too short [regexec_flags]...\n"));
2117   goto phooey;
2118  }
2119
2120  /* Check validity of program. */
2121  if (UCHARAT(progi->program) != REG_MAGIC) {
2122   Perl_croak(aTHX_ "corrupted regexp program");
2123  }
2124
2125  RX_MATCH_TAINTED_off(rx);
2126
2127  reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2128  reginfo->intuit = 0;
2129  reginfo->is_utf8_target = cBOOL(utf8_target);
2130  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2131  reginfo->warned = FALSE;
2132  reginfo->strbeg  = strbeg;
2133  reginfo->sv = sv;
2134  reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2135  reginfo->strend = strend;
2136  /* see how far we have to get to not match where we matched before */
2137  reginfo->till = startpos+minend;
2138
2139  /* reserve next 2 or 3 slots in PL_regmatch_state:
2140  * slot N+0: may currently be in use: skip it
2141  * slot N+1: use for regmatch_info_aux struct
2142  * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2143  * slot N+3: ready for use by regmatch()
2144  */
2145
2146  {
2147   regmatch_state *old_regmatch_state;
2148   regmatch_slab  *old_regmatch_slab;
2149   int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2150
2151   /* on first ever match, allocate first slab */
2152   if (!PL_regmatch_slab) {
2153    Newx(PL_regmatch_slab, 1, regmatch_slab);
2154    PL_regmatch_slab->prev = NULL;
2155    PL_regmatch_slab->next = NULL;
2156    PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2157   }
2158
2159   old_regmatch_state = PL_regmatch_state;
2160   old_regmatch_slab  = PL_regmatch_slab;
2161
2162   for (i=0; i <= max; i++) {
2163    if (i == 1)
2164     reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2165    else if (i ==2)
2166     reginfo->info_aux_eval =
2167     reginfo->info_aux->info_aux_eval =
2168        &(PL_regmatch_state->u.info_aux_eval);
2169
2170    if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2171     PL_regmatch_state = S_push_slab(aTHX);
2172   }
2173
2174   /* note initial PL_regmatch_state position; at end of match we'll
2175   * pop back to there and free any higher slabs */
2176
2177   reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2178   reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2179   reginfo->info_aux->poscache = NULL;
2180
2181   SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2182
2183   if ((prog->extflags & RXf_EVAL_SEEN))
2184    S_setup_eval_state(aTHX_ reginfo);
2185   else
2186    reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2187  }
2188
2189  /* If there is a "must appear" string, look for it. */
2190  s = startpos;
2191
2192  if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2193   MAGIC *mg;
2194   if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2195    reginfo->ganch = startpos + prog->gofs;
2196    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2197    "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2198   } else if (sv && SvTYPE(sv) >= SVt_PVMG
2199     && SvMAGIC(sv)
2200     && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2201     && mg->mg_len >= 0) {
2202    reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */
2203    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2204     "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2205
2206    if (prog->extflags & RXf_ANCH_GPOS) {
2207     if (s > reginfo->ganch)
2208      goto phooey;
2209     s = reginfo->ganch - prog->gofs;
2210     DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2211      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2212     if (s < strbeg)
2213      goto phooey;
2214    }
2215   }
2216   else if (data) {
2217    reginfo->ganch = strbeg + PTR2UV(data);
2218    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2219     "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2220
2221   } else {    /* pos() not defined */
2222    reginfo->ganch = strbeg;
2223    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2224     "GPOS: reginfo->ganch = strbeg\n"));
2225   }
2226  }
2227  if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2228   /* We have to be careful. If the previous successful match
2229   was from this regex we don't want a subsequent partially
2230   successful match to clobber the old results.
2231   So when we detect this possibility we add a swap buffer
2232   to the re, and switch the buffer each match. If we fail,
2233   we switch it back; otherwise we leave it swapped.
2234   */
2235   swap = prog->offs;
2236   /* do we need a save destructor here for eval dies? */
2237   Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2238   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2239    "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2240    PTR2UV(prog),
2241    PTR2UV(swap),
2242    PTR2UV(prog->offs)
2243   ));
2244  }
2245  if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2246   re_scream_pos_data d;
2247
2248   d.scream_olds = &scream_olds;
2249   d.scream_pos = &scream_pos;
2250   s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d);
2251   if (!s) {
2252    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2253    goto phooey; /* not present */
2254   }
2255  }
2256
2257
2258
2259  /* Simplest case:  anchored match need be tried only once. */
2260  /*  [unless only anchor is BOL and multiline is set] */
2261  if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2262   if (s == startpos && regtry(reginfo, &startpos))
2263    goto got_it;
2264   else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2265     || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2266   {
2267    char *end;
2268
2269    if (minlen)
2270     dontbother = minlen - 1;
2271    end = HOP3c(strend, -dontbother, strbeg) - 1;
2272    /* for multiline we only have to try after newlines */
2273    if (prog->check_substr || prog->check_utf8) {
2274     /* because of the goto we can not easily reuse the macros for bifurcating the
2275     unicode/non-unicode match modes here like we do elsewhere - demerphq */
2276     if (utf8_target) {
2277      if (s == startpos)
2278       goto after_try_utf8;
2279      while (1) {
2280       if (regtry(reginfo, &s)) {
2281        goto got_it;
2282       }
2283      after_try_utf8:
2284       if (s > end) {
2285        goto phooey;
2286       }
2287       if (prog->extflags & RXf_USE_INTUIT) {
2288        s = re_intuit_start(rx, sv, strbeg,
2289          s + UTF8SKIP(s), strend, flags, NULL);
2290        if (!s) {
2291         goto phooey;
2292        }
2293       }
2294       else {
2295        s += UTF8SKIP(s);
2296       }
2297      }
2298     } /* end search for check string in unicode */
2299     else {
2300      if (s == startpos) {
2301       goto after_try_latin;
2302      }
2303      while (1) {
2304       if (regtry(reginfo, &s)) {
2305        goto got_it;
2306       }
2307      after_try_latin:
2308       if (s > end) {
2309        goto phooey;
2310       }
2311       if (prog->extflags & RXf_USE_INTUIT) {
2312        s = re_intuit_start(rx, sv, strbeg,
2313           s + 1, strend, flags, NULL);
2314        if (!s) {
2315         goto phooey;
2316        }
2317       }
2318       else {
2319        s++;
2320       }
2321      }
2322     } /* end search for check string in latin*/
2323    } /* end search for check string */
2324    else { /* search for newline */
2325     if (s > startpos) {
2326      /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2327      s--;
2328     }
2329     /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2330     while (s <= end) { /* note it could be possible to match at the end of the string */
2331      if (*s++ == '\n') { /* don't need PL_utf8skip here */
2332       if (regtry(reginfo, &s))
2333        goto got_it;
2334      }
2335     }
2336    } /* end search for newline */
2337   } /* end anchored/multiline check string search */
2338   goto phooey;
2339  } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2340  {
2341   /* the warning about reginfo->ganch being used without initialization
2342   is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2343   and we only enter this block when the same bit is set. */
2344   char *tmp_s = reginfo->ganch - prog->gofs;
2345
2346   if (tmp_s >= strbeg && regtry(reginfo, &tmp_s))
2347    goto got_it;
2348   goto phooey;
2349  }
2350
2351  /* Messy cases:  unanchored match. */
2352  if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2353   /* we have /x+whatever/ */
2354   /* it must be a one character string (XXXX Except is_utf8_pat?) */
2355   char ch;
2356 #ifdef DEBUGGING
2357   int did_match = 0;
2358 #endif
2359   if (utf8_target) {
2360    if (! prog->anchored_utf8) {
2361     to_utf8_substr(prog);
2362    }
2363    ch = SvPVX_const(prog->anchored_utf8)[0];
2364    REXEC_FBC_SCAN(
2365     if (*s == ch) {
2366      DEBUG_EXECUTE_r( did_match = 1 );
2367      if (regtry(reginfo, &s)) goto got_it;
2368      s += UTF8SKIP(s);
2369      while (s < strend && *s == ch)
2370       s += UTF8SKIP(s);
2371     }
2372    );
2373
2374   }
2375   else {
2376    if (! prog->anchored_substr) {
2377     if (! to_byte_substr(prog)) {
2378      NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2379     }
2380    }
2381    ch = SvPVX_const(prog->anchored_substr)[0];
2382    REXEC_FBC_SCAN(
2383     if (*s == ch) {
2384      DEBUG_EXECUTE_r( did_match = 1 );
2385      if (regtry(reginfo, &s)) goto got_it;
2386      s++;
2387      while (s < strend && *s == ch)
2388       s++;
2389     }
2390    );
2391   }
2392   DEBUG_EXECUTE_r(if (!did_match)
2393     PerlIO_printf(Perl_debug_log,
2394         "Did not find anchored character...\n")
2395    );
2396  }
2397  else if (prog->anchored_substr != NULL
2398    || prog->anchored_utf8 != NULL
2399    || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2400     && prog->float_max_offset < strend - s)) {
2401   SV *must;
2402   I32 back_max;
2403   I32 back_min;
2404   char *last;
2405   char *last1;  /* Last position checked before */
2406 #ifdef DEBUGGING
2407   int did_match = 0;
2408 #endif
2409   if (prog->anchored_substr || prog->anchored_utf8) {
2410    if (utf8_target) {
2411     if (! prog->anchored_utf8) {
2412      to_utf8_substr(prog);
2413     }
2414     must = prog->anchored_utf8;
2415    }
2416    else {
2417     if (! prog->anchored_substr) {
2418      if (! to_byte_substr(prog)) {
2419       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2420      }
2421     }
2422     must = prog->anchored_substr;
2423    }
2424    back_max = back_min = prog->anchored_offset;
2425   } else {
2426    if (utf8_target) {
2427     if (! prog->float_utf8) {
2428      to_utf8_substr(prog);
2429     }
2430     must = prog->float_utf8;
2431    }
2432    else {
2433     if (! prog->float_substr) {
2434      if (! to_byte_substr(prog)) {
2435       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2436      }
2437     }
2438     must = prog->float_substr;
2439    }
2440    back_max = prog->float_max_offset;
2441    back_min = prog->float_min_offset;
2442   }
2443
2444   if (back_min<0) {
2445    last = strend;
2446   } else {
2447    last = HOP3c(strend, /* Cannot start after this */
2448     -(I32)(CHR_SVLEN(must)
2449       - (SvTAIL(must) != 0) + back_min), strbeg);
2450   }
2451   if (s > reginfo->strbeg)
2452    last1 = HOPc(s, -1);
2453   else
2454    last1 = s - 1; /* bogus */
2455
2456   /* XXXX check_substr already used to find "s", can optimize if
2457   check_substr==must. */
2458   scream_pos = -1;
2459   dontbother = end_shift;
2460   strend = HOPc(strend, -dontbother);
2461   while ( (s <= last) &&
2462     (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2463         (unsigned char*)strend, must,
2464         multiline ? FBMrf_MULTILINE : 0)) ) {
2465    DEBUG_EXECUTE_r( did_match = 1 );
2466    if (HOPc(s, -back_max) > last1) {
2467     last1 = HOPc(s, -back_min);
2468     s = HOPc(s, -back_max);
2469    }
2470    else {
2471     char * const t = (last1 >= reginfo->strbeg)
2472          ? HOPc(last1, 1) : last1 + 1;
2473
2474     last1 = HOPc(s, -back_min);
2475     s = t;
2476    }
2477    if (utf8_target) {
2478     while (s <= last1) {
2479      if (regtry(reginfo, &s))
2480       goto got_it;
2481      if (s >= last1) {
2482       s++; /* to break out of outer loop */
2483       break;
2484      }
2485      s += UTF8SKIP(s);
2486     }
2487    }
2488    else {
2489     while (s <= last1) {
2490      if (regtry(reginfo, &s))
2491       goto got_it;
2492      s++;
2493     }
2494    }
2495   }
2496   DEBUG_EXECUTE_r(if (!did_match) {
2497    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2498     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2499    PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2500        ((must == prog->anchored_substr || must == prog->anchored_utf8)
2501        ? "anchored" : "floating"),
2502     quoted, RE_SV_TAIL(must));
2503   });
2504   goto phooey;
2505  }
2506  else if ( (c = progi->regstclass) ) {
2507   if (minlen) {
2508    const OPCODE op = OP(progi->regstclass);
2509    /* don't bother with what can't match */
2510    if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2511     strend = HOPc(strend, -(minlen - 1));
2512   }
2513   DEBUG_EXECUTE_r({
2514    SV * const prop = sv_newmortal();
2515    regprop(prog, prop, c);
2516    {
2517     RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2518      s,strend-s,60);
2519     PerlIO_printf(Perl_debug_log,
2520      "Matching stclass %.*s against %s (%d bytes)\n",
2521      (int)SvCUR(prop), SvPVX_const(prop),
2522      quoted, (int)(strend - s));
2523    }
2524   });
2525   if (find_byclass(prog, c, s, strend, reginfo))
2526    goto got_it;
2527   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2528  }
2529  else {
2530   dontbother = 0;
2531   if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2532    /* Trim the end. */
2533    char *last= NULL;
2534    SV* float_real;
2535    STRLEN len;
2536    const char *little;
2537
2538    if (utf8_target) {
2539     if (! prog->float_utf8) {
2540      to_utf8_substr(prog);
2541     }
2542     float_real = prog->float_utf8;
2543    }
2544    else {
2545     if (! prog->float_substr) {
2546      if (! to_byte_substr(prog)) {
2547       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2548      }
2549     }
2550     float_real = prog->float_substr;
2551    }
2552
2553    little = SvPV_const(float_real, len);
2554    if (SvTAIL(float_real)) {
2555      /* This means that float_real contains an artificial \n on
2556      * the end due to the presence of something like this:
2557      * /foo$/ where we can match both "foo" and "foo\n" at the
2558      * end of the string.  So we have to compare the end of the
2559      * string first against the float_real without the \n and
2560      * then against the full float_real with the string.  We
2561      * have to watch out for cases where the string might be
2562      * smaller than the float_real or the float_real without
2563      * the \n. */
2564      char *checkpos= strend - len;
2565      DEBUG_OPTIMISE_r(
2566       PerlIO_printf(Perl_debug_log,
2567        "%sChecking for float_real.%s\n",
2568        PL_colors[4], PL_colors[5]));
2569      if (checkpos + 1 < strbeg) {
2570       /* can't match, even if we remove the trailing \n
2571       * string is too short to match */
2572       DEBUG_EXECUTE_r(
2573        PerlIO_printf(Perl_debug_log,
2574         "%sString shorter than required trailing substring, cannot match.%s\n",
2575         PL_colors[4], PL_colors[5]));
2576       goto phooey;
2577      } else if (memEQ(checkpos + 1, little, len - 1)) {
2578       /* can match, the end of the string matches without the
2579       * "\n" */
2580       last = checkpos + 1;
2581      } else if (checkpos < strbeg) {
2582       /* cant match, string is too short when the "\n" is
2583       * included */
2584       DEBUG_EXECUTE_r(
2585        PerlIO_printf(Perl_debug_log,
2586         "%sString does not contain required trailing substring, cannot match.%s\n",
2587         PL_colors[4], PL_colors[5]));
2588       goto phooey;
2589      } else if (!multiline) {
2590       /* non multiline match, so compare with the "\n" at the
2591       * end of the string */
2592       if (memEQ(checkpos, little, len)) {
2593        last= checkpos;
2594       } else {
2595        DEBUG_EXECUTE_r(
2596         PerlIO_printf(Perl_debug_log,
2597          "%sString does not contain required trailing substring, cannot match.%s\n",
2598          PL_colors[4], PL_colors[5]));
2599        goto phooey;
2600       }
2601      } else {
2602       /* multiline match, so we have to search for a place
2603       * where the full string is located */
2604       goto find_last;
2605      }
2606    } else {
2607     find_last:
2608      if (len)
2609       last = rninstr(s, strend, little, little + len);
2610      else
2611       last = strend; /* matching "$" */
2612    }
2613    if (!last) {
2614     /* at one point this block contained a comment which was
2615     * probably incorrect, which said that this was a "should not
2616     * happen" case.  Even if it was true when it was written I am
2617     * pretty sure it is not anymore, so I have removed the comment
2618     * and replaced it with this one. Yves */
2619     DEBUG_EXECUTE_r(
2620      PerlIO_printf(Perl_debug_log,
2621       "String does not contain required substring, cannot match.\n"
2622      ));
2623     goto phooey;
2624    }
2625    dontbother = strend - last + prog->float_min_offset;
2626   }
2627   if (minlen && (dontbother < minlen))
2628    dontbother = minlen - 1;
2629   strend -= dontbother;      /* this one's always in bytes! */
2630   /* We don't know much -- general case. */
2631   if (utf8_target) {
2632    for (;;) {
2633     if (regtry(reginfo, &s))
2634      goto got_it;
2635     if (s >= strend)
2636      break;
2637     s += UTF8SKIP(s);
2638    };
2639   }
2640   else {
2641    do {
2642     if (regtry(reginfo, &s))
2643      goto got_it;
2644    } while (s++ < strend);
2645   }
2646  }
2647
2648  /* Failure. */
2649  goto phooey;
2650
2651 got_it:
2652  DEBUG_BUFFERS_r(
2653   if (swap)
2654    PerlIO_printf(Perl_debug_log,
2655     "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2656     PTR2UV(prog),
2657     PTR2UV(swap)
2658    );
2659  );
2660  Safefree(swap);
2661
2662  /* clean up; this will trigger destructors that will free all slabs
2663  * above the current one, and cleanup the regmatch_info_aux
2664  * and regmatch_info_aux_eval sructs */
2665
2666  LEAVE_SCOPE(oldsave);
2667
2668  if (RXp_PAREN_NAMES(prog))
2669   (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2670
2671  RX_MATCH_UTF8_set(rx, utf8_target);
2672
2673  /* make sure $`, $&, $', and $digit will work later */
2674  if ( !(flags & REXEC_NOT_FIRST) ) {
2675   if (flags & REXEC_COPY_STR) {
2676 #ifdef PERL_ANY_COW
2677    if (SvCANCOW(sv)) {
2678     if (DEBUG_C_TEST) {
2679      PerlIO_printf(Perl_debug_log,
2680         "Copy on write: regexp capture, type %d\n",
2681         (int) SvTYPE(sv));
2682     }
2683     RX_MATCH_COPY_FREE(rx);
2684     prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2685     prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2686     assert (SvPOKp(prog->saved_copy));
2687     prog->sublen  = reginfo->strend - strbeg;
2688     prog->suboffset = 0;
2689     prog->subcoffset = 0;
2690    } else
2691 #endif
2692    {
2693     I32 min = 0;
2694     I32 max = reginfo->strend - strbeg;
2695     I32 sublen;
2696
2697     if (    (flags & REXEC_COPY_SKIP_POST)
2698      && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2699      && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2700     ) { /* don't copy $' part of string */
2701      U32 n = 0;
2702      max = -1;
2703      /* calculate the right-most part of the string covered
2704      * by a capture. Due to look-ahead, this may be to
2705      * the right of $&, so we have to scan all captures */
2706      while (n <= prog->lastparen) {
2707       if (prog->offs[n].end > max)
2708        max = prog->offs[n].end;
2709       n++;
2710      }
2711      if (max == -1)
2712       max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2713         ? prog->offs[0].start
2714         : 0;
2715      assert(max >= 0 && max <= reginfo->strend - strbeg);
2716     }
2717
2718     if (    (flags & REXEC_COPY_SKIP_PRE)
2719      && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2720      && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2721     ) { /* don't copy $` part of string */
2722      U32 n = 0;
2723      min = max;
2724      /* calculate the left-most part of the string covered
2725      * by a capture. Due to look-behind, this may be to
2726      * the left of $&, so we have to scan all captures */
2727      while (min && n <= prog->lastparen) {
2728       if (   prog->offs[n].start != -1
2729        && prog->offs[n].start < min)
2730       {
2731        min = prog->offs[n].start;
2732       }
2733       n++;
2734      }
2735      if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2736       && min >  prog->offs[0].end
2737      )
2738       min = prog->offs[0].end;
2739
2740     }
2741
2742     assert(min >= 0 && min <= max
2743      && min <= reginfo->strend - strbeg);
2744     sublen = max - min;
2745
2746     if (RX_MATCH_COPIED(rx)) {
2747      if (sublen > prog->sublen)
2748       prog->subbeg =
2749         (char*)saferealloc(prog->subbeg, sublen+1);
2750     }
2751     else
2752      prog->subbeg = (char*)safemalloc(sublen+1);
2753     Copy(strbeg + min, prog->subbeg, sublen, char);
2754     prog->subbeg[sublen] = '\0';
2755     prog->suboffset = min;
2756     prog->sublen = sublen;
2757     RX_MATCH_COPIED_on(rx);
2758    }
2759    prog->subcoffset = prog->suboffset;
2760    if (prog->suboffset && utf8_target) {
2761     /* Convert byte offset to chars.
2762     * XXX ideally should only compute this if @-/@+
2763     * has been seen, a la PL_sawampersand ??? */
2764
2765     /* If there's a direct correspondence between the
2766     * string which we're matching and the original SV,
2767     * then we can use the utf8 len cache associated with
2768     * the SV. In particular, it means that under //g,
2769     * sv_pos_b2u() will use the previously cached
2770     * position to speed up working out the new length of
2771     * subcoffset, rather than counting from the start of
2772     * the string each time. This stops
2773     *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2774     * from going quadratic */
2775     if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2776      sv_pos_b2u(sv, &(prog->subcoffset));
2777     else
2778      prog->subcoffset = utf8_length((U8*)strbeg,
2779           (U8*)(strbeg+prog->suboffset));
2780    }
2781   }
2782   else {
2783    RX_MATCH_COPY_FREE(rx);
2784    prog->subbeg = strbeg;
2785    prog->suboffset = 0;
2786    prog->subcoffset = 0;
2787    /* use reginfo->strend, as strend may have been modified */
2788    prog->sublen = reginfo->strend - strbeg;
2789   }
2790  }
2791
2792  return 1;
2793
2794 phooey:
2795  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2796       PL_colors[4], PL_colors[5]));
2797
2798  /* clean up; this will trigger destructors that will free all slabs
2799  * above the current one, and cleanup the regmatch_info_aux
2800  * and regmatch_info_aux_eval sructs */
2801
2802  LEAVE_SCOPE(oldsave);
2803
2804  if (swap) {
2805   /* we failed :-( roll it back */
2806   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2807    "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2808    PTR2UV(prog),
2809    PTR2UV(prog->offs),
2810    PTR2UV(swap)
2811   ));
2812   Safefree(prog->offs);
2813   prog->offs = swap;
2814  }
2815  return 0;
2816 }
2817
2818
2819 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
2820  * Do inc before dec, in case old and new rex are the same */
2821 #define SET_reg_curpm(Re2) \
2822  if (reginfo->info_aux_eval) {                   \
2823   (void)ReREFCNT_inc(Re2);      \
2824   ReREFCNT_dec(PM_GETRE(PL_reg_curpm));     \
2825   PM_SETRE((PL_reg_curpm), (Re2));     \
2826  }
2827
2828
2829 /*
2830  - regtry - try match at specific point
2831  */
2832 STATIC I32   /* 0 failure, 1 success */
2833 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2834 {
2835  dVAR;
2836  CHECKPOINT lastcp;
2837  REGEXP *const rx = reginfo->prog;
2838  regexp *const prog = ReANY(rx);
2839  I32 result;
2840  RXi_GET_DECL(prog,progi);
2841  GET_RE_DEBUG_FLAGS_DECL;
2842
2843  PERL_ARGS_ASSERT_REGTRY;
2844
2845  reginfo->cutpoint=NULL;
2846
2847  prog->offs[0].start = *startposp - reginfo->strbeg;
2848  prog->lastparen = 0;
2849  prog->lastcloseparen = 0;
2850
2851  /* XXXX What this code is doing here?!!!  There should be no need
2852  to do this again and again, prog->lastparen should take care of
2853  this!  --ilya*/
2854
2855  /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2856  * Actually, the code in regcppop() (which Ilya may be meaning by
2857  * prog->lastparen), is not needed at all by the test suite
2858  * (op/regexp, op/pat, op/split), but that code is needed otherwise
2859  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2860  * Meanwhile, this code *is* needed for the
2861  * above-mentioned test suite tests to succeed.  The common theme
2862  * on those tests seems to be returning null fields from matches.
2863  * --jhi updated by dapm */
2864 #if 1
2865  if (prog->nparens) {
2866   regexp_paren_pair *pp = prog->offs;
2867   I32 i;
2868   for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2869    ++pp;
2870    pp->start = -1;
2871    pp->end = -1;
2872   }
2873  }
2874 #endif
2875  REGCP_SET(lastcp);
2876  result = regmatch(reginfo, *startposp, progi->program + 1);
2877  if (result != -1) {
2878   prog->offs[0].end = result;
2879   return 1;
2880  }
2881  if (reginfo->cutpoint)
2882   *startposp= reginfo->cutpoint;
2883  REGCP_UNWIND(lastcp);
2884  return 0;
2885 }
2886
2887
2888 #define sayYES goto yes
2889 #define sayNO goto no
2890 #define sayNO_SILENT goto no_silent
2891
2892 /* we dont use STMT_START/END here because it leads to
2893    "unreachable code" warnings, which are bogus, but distracting. */
2894 #define CACHEsayNO \
2895  if (ST.cache_mask) \
2896  reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
2897  sayNO
2898
2899 /* this is used to determine how far from the left messages like
2900    'failed...' are printed. It should be set such that messages
2901    are inline with the regop output that created them.
2902 */
2903 #define REPORT_CODE_OFF 32
2904
2905
2906 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2907 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2908 #define CHRTEST_NOT_A_CP_1 -999
2909 #define CHRTEST_NOT_A_CP_2 -998
2910
2911 /* grab a new slab and return the first slot in it */
2912
2913 STATIC regmatch_state *
2914 S_push_slab(pTHX)
2915 {
2916 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2917  dMY_CXT;
2918 #endif
2919  regmatch_slab *s = PL_regmatch_slab->next;
2920  if (!s) {
2921   Newx(s, 1, regmatch_slab);
2922   s->prev = PL_regmatch_slab;
2923   s->next = NULL;
2924   PL_regmatch_slab->next = s;
2925  }
2926  PL_regmatch_slab = s;
2927  return SLAB_FIRST(s);
2928 }
2929
2930
2931 /* push a new state then goto it */
2932
2933 #define PUSH_STATE_GOTO(state, node, input) \
2934  pushinput = input; \
2935  scan = node; \
2936  st->resume_state = state; \
2937  goto push_state;
2938
2939 /* push a new state with success backtracking, then goto it */
2940
2941 #define PUSH_YES_STATE_GOTO(state, node, input) \
2942  pushinput = input; \
2943  scan = node; \
2944  st->resume_state = state; \
2945  goto push_yes_state;
2946
2947
2948
2949
2950 /*
2951
2952 regmatch() - main matching routine
2953
2954 This is basically one big switch statement in a loop. We execute an op,
2955 set 'next' to point the next op, and continue. If we come to a point which
2956 we may need to backtrack to on failure such as (A|B|C), we push a
2957 backtrack state onto the backtrack stack. On failure, we pop the top
2958 state, and re-enter the loop at the state indicated. If there are no more
2959 states to pop, we return failure.
2960
2961 Sometimes we also need to backtrack on success; for example /A+/, where
2962 after successfully matching one A, we need to go back and try to
2963 match another one; similarly for lookahead assertions: if the assertion
2964 completes successfully, we backtrack to the state just before the assertion
2965 and then carry on.  In these cases, the pushed state is marked as
2966 'backtrack on success too'. This marking is in fact done by a chain of
2967 pointers, each pointing to the previous 'yes' state. On success, we pop to
2968 the nearest yes state, discarding any intermediate failure-only states.
2969 Sometimes a yes state is pushed just to force some cleanup code to be
2970 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2971 it to free the inner regex.
2972
2973 Note that failure backtracking rewinds the cursor position, while
2974 success backtracking leaves it alone.
2975
2976 A pattern is complete when the END op is executed, while a subpattern
2977 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2978 ops trigger the "pop to last yes state if any, otherwise return true"
2979 behaviour.
2980
2981 A common convention in this function is to use A and B to refer to the two
2982 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2983 the subpattern to be matched possibly multiple times, while B is the entire
2984 rest of the pattern. Variable and state names reflect this convention.
2985
2986 The states in the main switch are the union of ops and failure/success of
2987 substates associated with with that op.  For example, IFMATCH is the op
2988 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2989 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2990 successfully matched A and IFMATCH_A_fail is a state saying that we have
2991 just failed to match A. Resume states always come in pairs. The backtrack
2992 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2993 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2994 on success or failure.
2995
2996 The struct that holds a backtracking state is actually a big union, with
2997 one variant for each major type of op. The variable st points to the
2998 top-most backtrack struct. To make the code clearer, within each
2999 block of code we #define ST to alias the relevant union.
3000
3001 Here's a concrete example of a (vastly oversimplified) IFMATCH
3002 implementation:
3003
3004  switch (state) {
3005  ....
3006
3007 #define ST st->u.ifmatch
3008
3009  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3010   ST.foo = ...; // some state we wish to save
3011   ...
3012   // push a yes backtrack state with a resume value of
3013   // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3014   // first node of A:
3015   PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3016   // NOTREACHED
3017
3018  case IFMATCH_A: // we have successfully executed A; now continue with B
3019   next = B;
3020   bar = ST.foo; // do something with the preserved value
3021   break;
3022
3023  case IFMATCH_A_fail: // A failed, so the assertion failed
3024   ...;   // do some housekeeping, then ...
3025   sayNO; // propagate the failure
3026
3027 #undef ST
3028
3029  ...
3030  }
3031
3032 For any old-timers reading this who are familiar with the old recursive
3033 approach, the code above is equivalent to:
3034
3035  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3036  {
3037   int foo = ...
3038   ...
3039   if (regmatch(A)) {
3040    next = B;
3041    bar = foo;
3042    break;
3043   }
3044   ...;   // do some housekeeping, then ...
3045   sayNO; // propagate the failure
3046  }
3047
3048 The topmost backtrack state, pointed to by st, is usually free. If you
3049 want to claim it, populate any ST.foo fields in it with values you wish to
3050 save, then do one of
3051
3052   PUSH_STATE_GOTO(resume_state, node, newinput);
3053   PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3054
3055 which sets that backtrack state's resume value to 'resume_state', pushes a
3056 new free entry to the top of the backtrack stack, then goes to 'node'.
3057 On backtracking, the free slot is popped, and the saved state becomes the
3058 new free state. An ST.foo field in this new top state can be temporarily
3059 accessed to retrieve values, but once the main loop is re-entered, it
3060 becomes available for reuse.
3061
3062 Note that the depth of the backtrack stack constantly increases during the
3063 left-to-right execution of the pattern, rather than going up and down with
3064 the pattern nesting. For example the stack is at its maximum at Z at the
3065 end of the pattern, rather than at X in the following:
3066
3067  /(((X)+)+)+....(Y)+....Z/
3068
3069 The only exceptions to this are lookahead/behind assertions and the cut,
3070 (?>A), which pop all the backtrack states associated with A before
3071 continuing.
3072
3073 Backtrack state structs are allocated in slabs of about 4K in size.
3074 PL_regmatch_state and st always point to the currently active state,
3075 and PL_regmatch_slab points to the slab currently containing
3076 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3077 allocated, and is never freed until interpreter destruction. When the slab
3078 is full, a new one is allocated and chained to the end. At exit from
3079 regmatch(), slabs allocated since entry are freed.
3080
3081 */
3082
3083
3084 #define DEBUG_STATE_pp(pp)        \
3085  DEBUG_STATE_r({         \
3086   DUMP_EXEC_POS(locinput, scan, utf8_target);      \
3087   PerlIO_printf(Perl_debug_log,       \
3088    "    %*s"pp" %s%s%s%s%s\n",       \
3089    depth*2, "",        \
3090    PL_reg_name[st->resume_state],                     \
3091    ((st==yes_state||st==mark_state) ? "[" : ""),   \
3092    ((st==yes_state) ? "Y" : ""),                   \
3093    ((st==mark_state) ? "M" : ""),                  \
3094    ((st==yes_state||st==mark_state) ? "]" : "")    \
3095   );                                                  \
3096  });
3097
3098
3099 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3100
3101 #ifdef DEBUGGING
3102
3103 STATIC void
3104 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3105  const char *start, const char *end, const char *blurb)
3106 {
3107  const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3108
3109  PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3110
3111  if (!PL_colorset)
3112    reginitcolors();
3113  {
3114   RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3115    RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3116
3117   RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3118    start, end - start, 60);
3119
3120   PerlIO_printf(Perl_debug_log,
3121    "%s%s REx%s %s against %s\n",
3122      PL_colors[4], blurb, PL_colors[5], s0, s1);
3123
3124   if (utf8_target||utf8_pat)
3125    PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3126     utf8_pat ? "pattern" : "",
3127     utf8_pat && utf8_target ? " and " : "",
3128     utf8_target ? "string" : ""
3129    );
3130  }
3131 }
3132
3133 STATIC void
3134 S_dump_exec_pos(pTHX_ const char *locinput,
3135      const regnode *scan,
3136      const char *loc_regeol,
3137      const char *loc_bostr,
3138      const char *loc_reg_starttry,
3139      const bool utf8_target)
3140 {
3141  const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3142  const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3143  int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3144  /* The part of the string before starttry has one color
3145  (pref0_len chars), between starttry and current
3146  position another one (pref_len - pref0_len chars),
3147  after the current position the third one.
3148  We assume that pref0_len <= pref_len, otherwise we
3149  decrease pref0_len.  */
3150  int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3151   ? (5 + taill) - l : locinput - loc_bostr;
3152  int pref0_len;
3153
3154  PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3155
3156  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3157   pref_len++;
3158  pref0_len = pref_len  - (locinput - loc_reg_starttry);
3159  if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3160   l = ( loc_regeol - locinput > (5 + taill) - pref_len
3161    ? (5 + taill) - pref_len : loc_regeol - locinput);
3162  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3163   l--;
3164  if (pref0_len < 0)
3165   pref0_len = 0;
3166  if (pref0_len > pref_len)
3167   pref0_len = pref_len;
3168  {
3169   const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3170
3171   RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3172    (locinput - pref_len),pref0_len, 60, 4, 5);
3173
3174   RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3175      (locinput - pref_len + pref0_len),
3176      pref_len - pref0_len, 60, 2, 3);
3177
3178   RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3179      locinput, loc_regeol - locinput, 10, 0, 1);
3180
3181   const STRLEN tlen=len0+len1+len2;
3182   PerlIO_printf(Perl_debug_log,
3183      "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3184      (IV)(locinput - loc_bostr),
3185      len0, s0,
3186      len1, s1,
3187      (docolor ? "" : "> <"),
3188      len2, s2,
3189      (int)(tlen > 19 ? 0 :  19 - tlen),
3190      "");
3191  }
3192 }
3193
3194 #endif
3195
3196 /* reg_check_named_buff_matched()
3197  * Checks to see if a named buffer has matched. The data array of
3198  * buffer numbers corresponding to the buffer is expected to reside
3199  * in the regexp->data->data array in the slot stored in the ARG() of
3200  * node involved. Note that this routine doesn't actually care about the
3201  * name, that information is not preserved from compilation to execution.
3202  * Returns the index of the leftmost defined buffer with the given name
3203  * or 0 if non of the buffers matched.
3204  */
3205 STATIC I32
3206 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3207 {
3208  I32 n;
3209  RXi_GET_DECL(rex,rexi);
3210  SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3211  I32 *nums=(I32*)SvPVX(sv_dat);
3212
3213  PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3214
3215  for ( n=0; n<SvIVX(sv_dat); n++ ) {
3216   if ((I32)rex->lastparen >= nums[n] &&
3217    rex->offs[nums[n]].end != -1)
3218   {
3219    return nums[n];
3220   }
3221  }
3222  return 0;
3223 }
3224
3225
3226 static bool
3227 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3228   U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3229 {
3230  /* This function determines if there are one or two characters that match
3231  * the first character of the passed-in EXACTish node <text_node>, and if
3232  * so, returns them in the passed-in pointers.
3233  *
3234  * If it determines that no possible character in the target string can
3235  * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3236  * the first character in <text_node> requires UTF-8 to represent, and the
3237  * target string isn't in UTF-8.)
3238  *
3239  * If there are more than two characters that could match the beginning of
3240  * <text_node>, or if more context is required to determine a match or not,
3241  * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3242  *
3243  * The motiviation behind this function is to allow the caller to set up
3244  * tight loops for matching.  If <text_node> is of type EXACT, there is
3245  * only one possible character that can match its first character, and so
3246  * the situation is quite simple.  But things get much more complicated if
3247  * folding is involved.  It may be that the first character of an EXACTFish
3248  * node doesn't participate in any possible fold, e.g., punctuation, so it
3249  * can be matched only by itself.  The vast majority of characters that are
3250  * in folds match just two things, their lower and upper-case equivalents.
3251  * But not all are like that; some have multiple possible matches, or match
3252  * sequences of more than one character.  This function sorts all that out.
3253  *
3254  * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3255  * loop of trying to match A*, we know we can't exit where the thing
3256  * following it isn't a B.  And something can't be a B unless it is the
3257  * beginning of B.  By putting a quick test for that beginning in a tight
3258  * loop, we can rule out things that can't possibly be B without having to
3259  * break out of the loop, thus avoiding work.  Similarly, if A is a single
3260  * character, we can make a tight loop matching A*, using the outputs of
3261  * this function.
3262  *
3263  * If the target string to match isn't in UTF-8, and there aren't
3264  * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3265  * the one or two possible octets (which are characters in this situation)
3266  * that can match.  In all cases, if there is only one character that can
3267  * match, *<c1p> and *<c2p> will be identical.
3268  *
3269  * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3270  * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3271  * can match the beginning of <text_node>.  They should be declared with at
3272  * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3273  * undefined what these contain.)  If one or both of the buffers are
3274  * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3275  * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3276  * *<c2p> will be set to a negative number(s) that shouldn't match any code
3277  * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3278  * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3279
3280  const bool utf8_target = reginfo->is_utf8_target;
3281
3282  UV c1 = CHRTEST_NOT_A_CP_1;
3283  UV c2 = CHRTEST_NOT_A_CP_2;
3284  bool use_chrtest_void = FALSE;
3285  const bool is_utf8_pat = reginfo->is_utf8_pat;
3286
3287  /* Used when we have both utf8 input and utf8 output, to avoid converting
3288  * to/from code points */
3289  bool utf8_has_been_setup = FALSE;
3290
3291  dVAR;
3292
3293  U8 *pat = (U8*)STRING(text_node);
3294
3295  if (OP(text_node) == EXACT) {
3296
3297   /* In an exact node, only one thing can be matched, that first
3298   * character.  If both the pat and the target are UTF-8, we can just
3299   * copy the input to the output, avoiding finding the code point of
3300   * that character */
3301   if (!is_utf8_pat) {
3302    c2 = c1 = *pat;
3303   }
3304   else if (utf8_target) {
3305    Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3306    Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3307    utf8_has_been_setup = TRUE;
3308   }
3309   else {
3310    c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3311   }
3312  }
3313  else /* an EXACTFish node */
3314   if ((is_utf8_pat
3315      && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3316              pat + STR_LEN(text_node)))
3317    || (!is_utf8_pat
3318      && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3319              pat + STR_LEN(text_node))))
3320  {
3321   /* Multi-character folds require more context to sort out.  Also
3322   * PL_utf8_foldclosures used below doesn't handle them, so have to be
3323   * handled outside this routine */
3324   use_chrtest_void = TRUE;
3325  }
3326  else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3327   c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3328   if (c1 > 256) {
3329    /* Load the folds hash, if not already done */
3330    SV** listp;
3331    if (! PL_utf8_foldclosures) {
3332     if (! PL_utf8_tofold) {
3333      U8 dummy[UTF8_MAXBYTES+1];
3334
3335      /* Force loading this by folding an above-Latin1 char */
3336      to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3337      assert(PL_utf8_tofold); /* Verify that worked */
3338     }
3339     PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3340    }
3341
3342    /* The fold closures data structure is a hash with the keys being
3343    * the UTF-8 of every character that is folded to, like 'k', and
3344    * the values each an array of all code points that fold to its
3345    * key.  e.g. [ 'k', 'K', KELVIN_SIGN ].  Multi-character folds are
3346    * not included */
3347    if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3348          (char *) pat,
3349          UTF8SKIP(pat),
3350          FALSE))))
3351    {
3352     /* Not found in the hash, therefore there are no folds
3353     * containing it, so there is only a single character that
3354     * could match */
3355     c2 = c1;
3356    }
3357    else {  /* Does participate in folds */
3358     AV* list = (AV*) *listp;
3359     if (av_len(list) != 1) {
3360
3361      /* If there aren't exactly two folds to this, it is outside
3362      * the scope of this function */
3363      use_chrtest_void = TRUE;
3364     }
3365     else {  /* There are two.  Get them */
3366      SV** c_p = av_fetch(list, 0, FALSE);
3367      if (c_p == NULL) {
3368       Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3369      }
3370      c1 = SvUV(*c_p);
3371
3372      c_p = av_fetch(list, 1, FALSE);
3373      if (c_p == NULL) {
3374       Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3375      }
3376      c2 = SvUV(*c_p);
3377
3378      /* Folds that cross the 255/256 boundary are forbidden if
3379      * EXACTFL, or EXACTFA and one is ASCIII.  Since the
3380      * pattern character is above 256, and its only other match
3381      * is below 256, the only legal match will be to itself.
3382      * We have thrown away the original, so have to compute
3383      * which is the one above 255 */
3384      if ((c1 < 256) != (c2 < 256)) {
3385       if (OP(text_node) == EXACTFL
3386        || (OP(text_node) == EXACTFA
3387         && (isASCII(c1) || isASCII(c2))))
3388       {
3389        if (c1 < 256) {
3390         c1 = c2;
3391        }
3392        else {
3393         c2 = c1;
3394        }
3395       }
3396      }
3397     }
3398    }
3399   }
3400   else /* Here, c1 is < 255 */
3401    if (utf8_target
3402     && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3403     && OP(text_node) != EXACTFL
3404     && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3405   {
3406    /* Here, there could be something above Latin1 in the target which
3407    * folds to this character in the pattern.  All such cases except
3408    * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3409    * involved in their folds, so are outside the scope of this
3410    * function */
3411    if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3412     c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3413    }
3414    else {
3415     use_chrtest_void = TRUE;
3416    }
3417   }
3418   else { /* Here nothing above Latin1 can fold to the pattern character */
3419    switch (OP(text_node)) {
3420
3421     case EXACTFL:   /* /l rules */
3422      c2 = PL_fold_locale[c1];
3423      break;
3424
3425     case EXACTF:
3426      if (! utf8_target) {    /* /d rules */
3427       c2 = PL_fold[c1];
3428       break;
3429      }
3430      /* FALLTHROUGH */
3431      /* /u rules for all these.  This happens to work for
3432      * EXACTFA as nothing in Latin1 folds to ASCII */
3433     case EXACTFA:
3434     case EXACTFU_TRICKYFOLD:
3435     case EXACTFU_SS:
3436     case EXACTFU:
3437      c2 = PL_fold_latin1[c1];
3438      break;
3439
3440     default:
3441      Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3442      assert(0); /* NOTREACHED */
3443    }
3444   }
3445  }
3446
3447  /* Here have figured things out.  Set up the returns */
3448  if (use_chrtest_void) {
3449   *c2p = *c1p = CHRTEST_VOID;
3450  }
3451  else if (utf8_target) {
3452   if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3453    uvchr_to_utf8(c1_utf8, c1);
3454    uvchr_to_utf8(c2_utf8, c2);
3455   }
3456
3457   /* Invariants are stored in both the utf8 and byte outputs; Use
3458   * negative numbers otherwise for the byte ones.  Make sure that the
3459   * byte ones are the same iff the utf8 ones are the same */
3460   *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3461   *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3462     ? *c2_utf8
3463     : (c1 == c2)
3464     ? CHRTEST_NOT_A_CP_1
3465     : CHRTEST_NOT_A_CP_2;
3466  }
3467  else if (c1 > 255) {
3468  if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3469       can represent */
3470   return FALSE;
3471  }
3472
3473  *c1p = *c2p = c2;    /* c2 is the only representable value */
3474  }
3475  else {  /* c1 is representable; see about c2 */
3476  *c1p = c1;
3477  *c2p = (c2 < 256) ? c2 : c1;
3478  }
3479
3480  return TRUE;
3481 }
3482
3483 /* returns -1 on failure, $+[0] on success */
3484 STATIC I32
3485 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3486 {
3487 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3488  dMY_CXT;
3489 #endif
3490  dVAR;
3491  const bool utf8_target = reginfo->is_utf8_target;
3492  const U32 uniflags = UTF8_ALLOW_DEFAULT;
3493  REGEXP *rex_sv = reginfo->prog;
3494  regexp *rex = ReANY(rex_sv);
3495  RXi_GET_DECL(rex,rexi);
3496  /* the current state. This is a cached copy of PL_regmatch_state */
3497  regmatch_state *st;
3498  /* cache heavy used fields of st in registers */
3499  regnode *scan;
3500  regnode *next;
3501  U32 n = 0; /* general value; init to avoid compiler warning */
3502  I32 ln = 0; /* len or last;  init to avoid compiler warning */
3503  char *locinput = startpos;
3504  char *pushinput; /* where to continue after a PUSH */
3505  I32 nextchr;   /* is always set to UCHARAT(locinput) */
3506
3507  bool result = 0;     /* return value of S_regmatch */
3508  int depth = 0;     /* depth of backtrack stack */
3509  U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3510  const U32 max_nochange_depth =
3511   (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3512   3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3513  regmatch_state *yes_state = NULL; /* state to pop to on success of
3514                subpattern */
3515  /* mark_state piggy backs on the yes_state logic so that when we unwind
3516  the stack on success we can update the mark_state as we go */
3517  regmatch_state *mark_state = NULL; /* last mark state we have seen */
3518  regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3519  struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3520  U32 state_num;
3521  bool no_final = 0;      /* prevent failure from backtracking? */
3522  bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3523  char *startpoint = locinput;
3524  SV *popmark = NULL;     /* are we looking for a mark? */
3525  SV *sv_commit = NULL;   /* last mark name seen in failure */
3526  SV *sv_yes_mark = NULL; /* last mark name we have seen
3527        during a successful match */
3528  U32 lastopen = 0;       /* last open we saw */
3529  bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3530  SV* const oreplsv = GvSV(PL_replgv);
3531  /* these three flags are set by various ops to signal information to
3532  * the very next op. They have a useful lifetime of exactly one loop
3533  * iteration, and are not preserved or restored by state pushes/pops
3534  */
3535  bool sw = 0;     /* the condition value in (?(cond)a|b) */
3536  bool minmod = 0;     /* the next "{n,m}" is a "{n,m}?" */
3537  int logical = 0;     /* the following EVAL is:
3538         0: (?{...})
3539         1: (?(?{...})X|Y)
3540         2: (??{...})
3541        or the following IFMATCH/UNLESSM is:
3542         false: plain (?=foo)
3543         true:  used as a condition: (?(?=foo))
3544        */
3545  PAD* last_pad = NULL;
3546  dMULTICALL;
3547  I32 gimme = G_SCALAR;
3548  CV *caller_cv = NULL; /* who called us */
3549  CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3550  CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3551  U32 maxopenparen = 0;       /* max '(' index seen so far */
3552  int to_complement;  /* Invert the result? */
3553  _char_class_number classnum;
3554  bool is_utf8_pat = reginfo->is_utf8_pat;
3555
3556 #ifdef DEBUGGING
3557  GET_RE_DEBUG_FLAGS_DECL;
3558 #endif
3559
3560  /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3561  multicall_oldcatch = 0;
3562  multicall_cv = NULL;
3563  cx = NULL;
3564  PERL_UNUSED_VAR(multicall_cop);
3565  PERL_UNUSED_VAR(newsp);
3566
3567
3568  PERL_ARGS_ASSERT_REGMATCH;
3569
3570  DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3571    PerlIO_printf(Perl_debug_log,"regmatch start\n");
3572  }));
3573
3574  st = PL_regmatch_state;
3575
3576  /* Note that nextchr is a byte even in UTF */
3577  SET_nextchr;
3578  scan = prog;
3579  while (scan != NULL) {
3580
3581   DEBUG_EXECUTE_r( {
3582    SV * const prop = sv_newmortal();
3583    regnode *rnext=regnext(scan);
3584    DUMP_EXEC_POS( locinput, scan, utf8_target );
3585    regprop(rex, prop, scan);
3586
3587    PerlIO_printf(Perl_debug_log,
3588      "%3"IVdf":%*s%s(%"IVdf")\n",
3589      (IV)(scan - rexi->program), depth*2, "",
3590      SvPVX_const(prop),
3591      (PL_regkind[OP(scan)] == END || !rnext) ?
3592       0 : (IV)(rnext - rexi->program));
3593   });
3594
3595   next = scan + NEXT_OFF(scan);
3596   if (next == scan)
3597    next = NULL;
3598   state_num = OP(scan);
3599
3600   REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3601  reenter_switch:
3602   to_complement = 0;
3603
3604   SET_nextchr;
3605   assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3606
3607   switch (state_num) {
3608   case BOL: /*  /^../  */
3609    if (locinput == reginfo->strbeg)
3610     break;
3611    sayNO;
3612
3613   case MBOL: /*  /^../m  */
3614    if (locinput == reginfo->strbeg ||
3615     (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3616    {
3617     break;
3618    }
3619    sayNO;
3620
3621   case SBOL: /*  /^../s  */
3622    if (locinput == reginfo->strbeg)
3623     break;
3624    sayNO;
3625
3626   case GPOS: /*  \G  */
3627    if (locinput == reginfo->ganch)
3628     break;
3629    sayNO;
3630
3631   case KEEPS: /*   \K  */
3632    /* update the startpoint */
3633    st->u.keeper.val = rex->offs[0].start;
3634    rex->offs[0].start = locinput - reginfo->strbeg;
3635    PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3636    assert(0); /*NOTREACHED*/
3637   case KEEPS_next_fail:
3638    /* rollback the start point change */
3639    rex->offs[0].start = st->u.keeper.val;
3640    sayNO_SILENT;
3641    assert(0); /*NOTREACHED*/
3642
3643   case EOL: /* /..$/  */
3644     goto seol;
3645
3646   case MEOL: /* /..$/m  */
3647    if (!NEXTCHR_IS_EOS && nextchr != '\n')
3648     sayNO;
3649    break;
3650
3651   case SEOL: /* /..$/s  */
3652   seol:
3653    if (!NEXTCHR_IS_EOS && nextchr != '\n')
3654     sayNO;
3655    if (reginfo->strend - locinput > 1)
3656     sayNO;
3657    break;
3658
3659   case EOS: /*  \z  */
3660    if (!NEXTCHR_IS_EOS)
3661     sayNO;
3662    break;
3663
3664   case SANY: /*  /./s  */
3665    if (NEXTCHR_IS_EOS)
3666     sayNO;
3667    goto increment_locinput;
3668
3669   case CANY: /*  \C  */
3670    if (NEXTCHR_IS_EOS)
3671     sayNO;
3672    locinput++;
3673    break;
3674
3675   case REG_ANY: /*  /./  */
3676    if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3677     sayNO;
3678    goto increment_locinput;
3679
3680
3681 #undef  ST
3682 #define ST st->u.trie
3683   case TRIEC: /* (ab|cd) with known charclass */
3684    /* In this case the charclass data is available inline so
3685    we can fail fast without a lot of extra overhead.
3686    */
3687    if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3688     DEBUG_EXECUTE_r(
3689      PerlIO_printf(Perl_debug_log,
3690        "%*s  %sfailed to match trie start class...%s\n",
3691        REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3692     );
3693     sayNO_SILENT;
3694     assert(0); /* NOTREACHED */
3695    }
3696    /* FALL THROUGH */
3697   case TRIE:  /* (ab|cd)  */
3698    /* the basic plan of execution of the trie is:
3699    * At the beginning, run though all the states, and
3700    * find the longest-matching word. Also remember the position
3701    * of the shortest matching word. For example, this pattern:
3702    *    1  2 3 4    5
3703    *    ab|a|x|abcd|abc
3704    * when matched against the string "abcde", will generate
3705    * accept states for all words except 3, with the longest
3706    * matching word being 4, and the shortest being 2 (with
3707    * the position being after char 1 of the string).
3708    *
3709    * Then for each matching word, in word order (i.e. 1,2,4,5),
3710    * we run the remainder of the pattern; on each try setting
3711    * the current position to the character following the word,
3712    * returning to try the next word on failure.
3713    *
3714    * We avoid having to build a list of words at runtime by
3715    * using a compile-time structure, wordinfo[].prev, which
3716    * gives, for each word, the previous accepting word (if any).
3717    * In the case above it would contain the mappings 1->2, 2->0,
3718    * 3->0, 4->5, 5->1.  We can use this table to generate, from
3719    * the longest word (4 above), a list of all words, by
3720    * following the list of prev pointers; this gives us the
3721    * unordered list 4,5,1,2. Then given the current word we have
3722    * just tried, we can go through the list and find the
3723    * next-biggest word to try (so if we just failed on word 2,
3724    * the next in the list is 4).
3725    *
3726    * Since at runtime we don't record the matching position in
3727    * the string for each word, we have to work that out for
3728    * each word we're about to process. The wordinfo table holds
3729    * the character length of each word; given that we recorded
3730    * at the start: the position of the shortest word and its
3731    * length in chars, we just need to move the pointer the
3732    * difference between the two char lengths. Depending on
3733    * Unicode status and folding, that's cheap or expensive.
3734    *
3735    * This algorithm is optimised for the case where are only a
3736    * small number of accept states, i.e. 0,1, or maybe 2.
3737    * With lots of accepts states, and having to try all of them,
3738    * it becomes quadratic on number of accept states to find all
3739    * the next words.
3740    */
3741
3742    {
3743     /* what type of TRIE am I? (utf8 makes this contextual) */
3744     DECL_TRIE_TYPE(scan);
3745
3746     /* what trie are we using right now */
3747     reg_trie_data * const trie
3748      = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3749     HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3750     U32 state = trie->startstate;
3751
3752     if (   trie->bitmap
3753      && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3754     {
3755      if (trie->states[ state ].wordnum) {
3756       DEBUG_EXECUTE_r(
3757        PerlIO_printf(Perl_debug_log,
3758           "%*s  %smatched empty string...%s\n",
3759           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3760       );
3761       if (!trie->jump)
3762        break;
3763      } else {
3764       DEBUG_EXECUTE_r(
3765        PerlIO_printf(Perl_debug_log,
3766           "%*s  %sfailed to match trie start class...%s\n",
3767           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3768       );
3769       sayNO_SILENT;
3770     }
3771     }
3772
3773    {
3774     U8 *uc = ( U8* )locinput;
3775
3776     STRLEN len = 0;
3777     STRLEN foldlen = 0;
3778     U8 *uscan = (U8*)NULL;
3779     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3780     U32 charcount = 0; /* how many input chars we have matched */
3781     U32 accepted = 0; /* have we seen any accepting states? */
3782
3783     ST.jump = trie->jump;
3784     ST.me = scan;
3785     ST.firstpos = NULL;
3786     ST.longfold = FALSE; /* char longer if folded => it's harder */
3787     ST.nextword = 0;
3788
3789     /* fully traverse the TRIE; note the position of the
3790     shortest accept state and the wordnum of the longest
3791     accept state */
3792
3793     while ( state && uc <= (U8*)(reginfo->strend) ) {
3794      U32 base = trie->states[ state ].trans.base;
3795      UV uvc = 0;
3796      U16 charid = 0;
3797      U16 wordnum;
3798      wordnum = trie->states[ state ].wordnum;
3799
3800      if (wordnum) { /* it's an accept state */
3801       if (!accepted) {
3802        accepted = 1;
3803        /* record first match position */
3804        if (ST.longfold) {
3805         ST.firstpos = (U8*)locinput;
3806         ST.firstchars = 0;
3807        }
3808        else {
3809         ST.firstpos = uc;
3810         ST.firstchars = charcount;
3811        }
3812       }
3813       if (!ST.nextword || wordnum < ST.nextword)
3814        ST.nextword = wordnum;
3815       ST.topword = wordnum;
3816      }
3817
3818      DEBUG_TRIE_EXECUTE_r({
3819         DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3820         PerlIO_printf( Perl_debug_log,
3821          "%*s  %sState: %4"UVxf" Accepted: %c ",
3822          2+depth * 2, "", PL_colors[4],
3823          (UV)state, (accepted ? 'Y' : 'N'));
3824      });
3825
3826      /* read a char and goto next state */
3827      if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
3828       I32 offset;
3829       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3830            uscan, len, uvc, charid, foldlen,
3831            foldbuf, uniflags);
3832       charcount++;
3833       if (foldlen>0)
3834        ST.longfold = TRUE;
3835       if (charid &&
3836        ( ((offset =
3837        base + charid - 1 - trie->uniquecharcount)) >= 0)
3838
3839        && ((U32)offset < trie->lasttrans)
3840        && trie->trans[offset].check == state)
3841       {
3842        state = trie->trans[offset].next;
3843       }
3844       else {
3845        state = 0;
3846       }
3847       uc += len;
3848
3849      }
3850      else {
3851       state = 0;
3852      }
3853      DEBUG_TRIE_EXECUTE_r(
3854       PerlIO_printf( Perl_debug_log,
3855        "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3856        charid, uvc, (UV)state, PL_colors[5] );
3857      );
3858     }
3859     if (!accepted)
3860     sayNO;
3861
3862     /* calculate total number of accept states */
3863     {
3864      U16 w = ST.topword;
3865      accepted = 0;
3866      while (w) {
3867       w = trie->wordinfo[w].prev;
3868       accepted++;
3869      }
3870      ST.accepted = accepted;
3871     }
3872
3873     DEBUG_EXECUTE_r(
3874      PerlIO_printf( Perl_debug_log,
3875       "%*s  %sgot %"IVdf" possible matches%s\n",
3876       REPORT_CODE_OFF + depth * 2, "",
3877       PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3878     );
3879     goto trie_first_try; /* jump into the fail handler */
3880    }}
3881    assert(0); /* NOTREACHED */
3882
3883   case TRIE_next_fail: /* we failed - try next alternative */
3884   {
3885    U8 *uc;
3886    if ( ST.jump) {
3887     REGCP_UNWIND(ST.cp);
3888     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3889    }
3890    if (!--ST.accepted) {
3891     DEBUG_EXECUTE_r({
3892      PerlIO_printf( Perl_debug_log,
3893       "%*s  %sTRIE failed...%s\n",
3894       REPORT_CODE_OFF+depth*2, "",
3895       PL_colors[4],
3896       PL_colors[5] );
3897     });
3898     sayNO_SILENT;
3899    }
3900    {
3901     /* Find next-highest word to process.  Note that this code
3902     * is O(N^2) per trie run (O(N) per branch), so keep tight */
3903     U16 min = 0;
3904     U16 word;
3905     U16 const nextword = ST.nextword;
3906     reg_trie_wordinfo * const wordinfo
3907      = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3908     for (word=ST.topword; word; word=wordinfo[word].prev) {
3909      if (word > nextword && (!min || word < min))
3910       min = word;
3911     }
3912     ST.nextword = min;
3913    }
3914
3915   trie_first_try:
3916    if (do_cutgroup) {
3917     do_cutgroup = 0;
3918     no_final = 0;
3919    }
3920
3921    if ( ST.jump) {
3922     ST.lastparen = rex->lastparen;
3923     ST.lastcloseparen = rex->lastcloseparen;
3924     REGCP_SET(ST.cp);
3925    }
3926
3927    /* find start char of end of current word */
3928    {
3929     U32 chars; /* how many chars to skip */
3930     reg_trie_data * const trie
3931      = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3932
3933     assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3934        >=  ST.firstchars);
3935     chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3936        - ST.firstchars;
3937     uc = ST.firstpos;
3938
3939     if (ST.longfold) {
3940      /* the hard option - fold each char in turn and find
3941      * its folded length (which may be different */
3942      U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3943      STRLEN foldlen;
3944      STRLEN len;
3945      UV uvc;
3946      U8 *uscan;
3947
3948      while (chars) {
3949       if (utf8_target) {
3950        uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3951              uniflags);
3952        uc += len;
3953       }
3954       else {
3955        uvc = *uc;
3956        uc++;
3957       }
3958       uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3959       uscan = foldbuf;
3960       while (foldlen) {
3961        if (!--chars)
3962         break;
3963        uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3964            uniflags);
3965        uscan += len;
3966        foldlen -= len;
3967       }
3968      }
3969     }
3970     else {
3971      if (utf8_target)
3972       while (chars--)
3973        uc += UTF8SKIP(uc);
3974      else
3975       uc += chars;
3976     }
3977    }
3978
3979    scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3980        ? ST.jump[ST.nextword]
3981        : NEXT_OFF(ST.me));
3982
3983    DEBUG_EXECUTE_r({
3984     PerlIO_printf( Perl_debug_log,
3985      "%*s  %sTRIE matched word #%d, continuing%s\n",
3986      REPORT_CODE_OFF+depth*2, "",
3987      PL_colors[4],
3988      ST.nextword,
3989      PL_colors[5]
3990      );
3991    });
3992
3993    if (ST.accepted > 1 || has_cutgroup) {
3994     PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
3995     assert(0); /* NOTREACHED */
3996    }
3997    /* only one choice left - just continue */
3998    DEBUG_EXECUTE_r({
3999     AV *const trie_words
4000      = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4001     SV ** const tmp = av_fetch( trie_words,
4002      ST.nextword-1, 0 );
4003     SV *sv= tmp ? sv_newmortal() : NULL;
4004
4005     PerlIO_printf( Perl_debug_log,
4006      "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4007      REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4008      ST.nextword,
4009      tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4010        PL_colors[0], PL_colors[1],
4011        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4012       )
4013      : "not compiled under -Dr",
4014      PL_colors[5] );
4015    });
4016
4017    locinput = (char*)uc;
4018    continue; /* execute rest of RE */
4019    assert(0); /* NOTREACHED */
4020   }
4021 #undef  ST
4022
4023   case EXACT: {            /*  /abc/        */
4024    char *s = STRING(scan);
4025    ln = STR_LEN(scan);
4026    if (utf8_target != is_utf8_pat) {
4027     /* The target and the pattern have differing utf8ness. */
4028     char *l = locinput;
4029     const char * const e = s + ln;
4030
4031     if (utf8_target) {
4032      /* The target is utf8, the pattern is not utf8.
4033      * Above-Latin1 code points can't match the pattern;
4034      * invariants match exactly, and the other Latin1 ones need
4035      * to be downgraded to a single byte in order to do the
4036      * comparison.  (If we could be confident that the target
4037      * is not malformed, this could be refactored to have fewer
4038      * tests by just assuming that if the first bytes match, it
4039      * is an invariant, but there are tests in the test suite
4040      * dealing with (??{...}) which violate this) */
4041      while (s < e) {
4042       if (l >= reginfo->strend
4043        || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4044       {
4045        sayNO;
4046       }
4047       if (UTF8_IS_INVARIANT(*(U8*)l)) {
4048        if (*l != *s) {
4049         sayNO;
4050        }
4051        l++;
4052       }
4053       else {
4054        if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4055         sayNO;
4056        }
4057        l += 2;
4058       }
4059       s++;
4060      }
4061     }
4062     else {
4063      /* The target is not utf8, the pattern is utf8. */
4064      while (s < e) {
4065       if (l >= reginfo->strend
4066        || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4067       {
4068        sayNO;
4069       }
4070       if (UTF8_IS_INVARIANT(*(U8*)s)) {
4071        if (*s != *l) {
4072         sayNO;
4073        }
4074        s++;
4075       }
4076       else {
4077        if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4078         sayNO;
4079        }
4080        s += 2;
4081       }
4082       l++;
4083      }
4084     }
4085     locinput = l;
4086    }
4087    else {
4088     /* The target and the pattern have the same utf8ness. */
4089     /* Inline the first character, for speed. */
4090     if (reginfo->strend - locinput < ln
4091      || UCHARAT(s) != nextchr
4092      || (ln > 1 && memNE(s, locinput, ln)))
4093     {
4094      sayNO;
4095     }
4096     locinput += ln;
4097    }
4098    break;
4099    }
4100
4101   case EXACTFL: {          /*  /abc/il      */
4102    re_fold_t folder;
4103    const U8 * fold_array;
4104    const char * s;
4105    U32 fold_utf8_flags;
4106
4107    RX_MATCH_TAINTED_on(reginfo->prog);
4108    folder = foldEQ_locale;
4109    fold_array = PL_fold_locale;
4110    fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4111    goto do_exactf;
4112
4113   case EXACTFU_SS:         /*  /\x{df}/iu   */
4114   case EXACTFU_TRICKYFOLD: /*  /\x{390}/iu  */
4115   case EXACTFU:            /*  /abc/iu      */
4116    folder = foldEQ_latin1;
4117    fold_array = PL_fold_latin1;
4118    fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4119    goto do_exactf;
4120
4121   case EXACTFA:            /*  /abc/iaa     */
4122    folder = foldEQ_latin1;
4123    fold_array = PL_fold_latin1;
4124    fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4125    goto do_exactf;
4126
4127   case EXACTF:             /*  /abc/i       */
4128    folder = foldEQ;
4129    fold_array = PL_fold;
4130    fold_utf8_flags = 0;
4131
4132   do_exactf:
4133    s = STRING(scan);
4134    ln = STR_LEN(scan);
4135
4136    if (utf8_target || is_utf8_pat || state_num == EXACTFU_SS) {
4137    /* Either target or the pattern are utf8, or has the issue where
4138    * the fold lengths may differ. */
4139     const char * const l = locinput;
4140     char *e = reginfo->strend;
4141
4142     if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4143           l, &e, 0,  utf8_target, fold_utf8_flags))
4144     {
4145      sayNO;
4146     }
4147     locinput = e;
4148     break;
4149    }
4150
4151    /* Neither the target nor the pattern are utf8 */
4152    if (UCHARAT(s) != nextchr
4153     && !NEXTCHR_IS_EOS
4154     && UCHARAT(s) != fold_array[nextchr])
4155    {
4156     sayNO;
4157    }
4158    if (reginfo->strend - locinput < ln)
4159     sayNO;
4160    if (ln > 1 && ! folder(s, locinput, ln))
4161     sayNO;
4162    locinput += ln;
4163    break;
4164   }
4165
4166   /* XXX Could improve efficiency by separating these all out using a
4167   * macro or in-line function.  At that point regcomp.c would no longer
4168   * have to set the FLAGS fields of these */
4169   case BOUNDL:  /*  /\b/l  */
4170   case NBOUNDL: /*  /\B/l  */
4171    RX_MATCH_TAINTED_on(reginfo->prog);
4172    /* FALL THROUGH */
4173   case BOUND:   /*  /\b/   */
4174   case BOUNDU:  /*  /\b/u  */
4175   case BOUNDA:  /*  /\b/a  */
4176   case NBOUND:  /*  /\B/   */
4177   case NBOUNDU: /*  /\B/u  */
4178   case NBOUNDA: /*  /\B/a  */
4179    /* was last char in word? */
4180    if (utf8_target
4181     && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4182     && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4183    {
4184     if (locinput == reginfo->strbeg)
4185      ln = '\n';
4186     else {
4187      const U8 * const r =
4188        reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4189
4190      ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4191     }
4192     if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4193      ln = isWORDCHAR_uni(ln);
4194      if (NEXTCHR_IS_EOS)
4195       n = 0;
4196      else {
4197       LOAD_UTF8_CHARCLASS_ALNUM();
4198       n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4199                 utf8_target);
4200      }
4201     }
4202     else {
4203      ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
4204      n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4205     }
4206    }
4207    else {
4208
4209     /* Here the string isn't utf8, or is utf8 and only ascii
4210     * characters are to match \w.  In the latter case looking at
4211     * the byte just prior to the current one may be just the final
4212     * byte of a multi-byte character.  This is ok.  There are two
4213     * cases:
4214     * 1) it is a single byte character, and then the test is doing
4215     * just what it's supposed to.
4216     * 2) it is a multi-byte character, in which case the final
4217     * byte is never mistakable for ASCII, and so the test
4218     * will say it is not a word character, which is the
4219     * correct answer. */
4220     ln = (locinput != reginfo->strbeg) ?
4221      UCHARAT(locinput - 1) : '\n';
4222     switch (FLAGS(scan)) {
4223      case REGEX_UNICODE_CHARSET:
4224       ln = isWORDCHAR_L1(ln);
4225       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4226       break;
4227      case REGEX_LOCALE_CHARSET:
4228       ln = isWORDCHAR_LC(ln);
4229       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4230       break;
4231      case REGEX_DEPENDS_CHARSET:
4232       ln = isWORDCHAR(ln);
4233       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4234       break;
4235      case REGEX_ASCII_RESTRICTED_CHARSET:
4236      case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4237       ln = isWORDCHAR_A(ln);
4238       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4239       break;
4240      default:
4241       Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4242       break;
4243     }
4244    }
4245    /* Note requires that all BOUNDs be lower than all NBOUNDs in
4246    * regcomp.sym */
4247    if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4248      sayNO;
4249    break;
4250
4251   case ANYOF:  /*  /[abc]/       */
4252   case ANYOF_WARN_SUPER:
4253    if (NEXTCHR_IS_EOS)
4254     sayNO;
4255    if (utf8_target) {
4256     if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4257      sayNO;
4258     locinput += UTF8SKIP(locinput);
4259    }
4260    else {
4261     if (!REGINCLASS(rex, scan, (U8*)locinput))
4262      sayNO;
4263     locinput++;
4264    }
4265    break;
4266
4267   /* The argument (FLAGS) to all the POSIX node types is the class number
4268   * */
4269
4270   case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4271    to_complement = 1;
4272    /* FALLTHROUGH */
4273
4274   case POSIXL:    /* \w or [:punct:] etc. under /l */
4275    if (NEXTCHR_IS_EOS)
4276     sayNO;
4277
4278    /* The locale hasn't influenced the outcome before this, so defer
4279    * tainting until now */
4280    RX_MATCH_TAINTED_on(reginfo->prog);
4281
4282    /* Use isFOO_lc() for characters within Latin1.  (Note that
4283    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4284    * wouldn't be invariant) */
4285    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4286     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4287      sayNO;
4288     }
4289    }
4290    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4291     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4292           (U8) TWO_BYTE_UTF8_TO_UNI(nextchr,
4293                *(locinput + 1))))))
4294     {
4295      sayNO;
4296     }
4297    }
4298    else { /* Here, must be an above Latin-1 code point */
4299     goto utf8_posix_not_eos;
4300    }
4301
4302    /* Here, must be utf8 */
4303    locinput += UTF8SKIP(locinput);
4304    break;
4305
4306   case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4307    to_complement = 1;
4308    /* FALLTHROUGH */
4309
4310   case POSIXD:    /* \w or [:punct:] etc. under /d */
4311    if (utf8_target) {
4312     goto utf8_posix;
4313    }
4314    goto posixa;
4315
4316   case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4317
4318    if (NEXTCHR_IS_EOS) {
4319     sayNO;
4320    }
4321
4322    /* All UTF-8 variants match */
4323    if (! UTF8_IS_INVARIANT(nextchr)) {
4324     goto increment_locinput;
4325    }
4326
4327    to_complement = 1;
4328    /* FALLTHROUGH */
4329
4330   case POSIXA:    /* \w or [:punct:] etc. under /a */
4331
4332   posixa:
4333    /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4334    * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4335    * character is a single byte */
4336
4337    if (NEXTCHR_IS_EOS
4338     || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4339                FLAGS(scan)))))
4340    {
4341     sayNO;
4342    }
4343
4344    /* Here we are either not in utf8, or we matched a utf8-invariant,
4345    * so the next char is the next byte */
4346    locinput++;
4347    break;
4348
4349   case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4350    to_complement = 1;
4351    /* FALLTHROUGH */
4352
4353   case POSIXU:    /* \w or [:punct:] etc. under /u */
4354   utf8_posix:
4355    if (NEXTCHR_IS_EOS) {
4356     sayNO;
4357    }
4358   utf8_posix_not_eos:
4359
4360    /* Use _generic_isCC() for characters within Latin1.  (Note that
4361    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4362    * wouldn't be invariant) */
4363    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4364     if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4365               FLAGS(scan)))))
4366     {
4367      sayNO;
4368     }
4369     locinput++;
4370    }
4371    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4372     if (! (to_complement
4373      ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr,
4374                *(locinput + 1)),
4375            FLAGS(scan)))))
4376     {
4377      sayNO;
4378     }
4379     locinput += 2;
4380    }
4381    else {  /* Handle above Latin-1 code points */
4382     classnum = (_char_class_number) FLAGS(scan);
4383     if (classnum < _FIRST_NON_SWASH_CC) {
4384
4385      /* Here, uses a swash to find such code points.  Load if if
4386      * not done already */
4387      if (! PL_utf8_swash_ptrs[classnum]) {
4388       U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4389       PL_utf8_swash_ptrs[classnum]
4390         = _core_swash_init("utf8",
4391           swash_property_names[classnum],
4392           &PL_sv_undef, 1, 0, NULL, &flags);
4393      }
4394      if (! (to_complement
4395       ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4396            (U8 *) locinput, TRUE))))
4397      {
4398       sayNO;
4399      }
4400     }
4401     else {  /* Here, uses macros to find above Latin-1 code points */
4402      switch (classnum) {
4403       case _CC_ENUM_SPACE:    /* XXX would require separate
4404             code if we revert the change
4405             of \v matching this */
4406       case _CC_ENUM_PSXSPC:
4407        if (! (to_complement
4408           ^ cBOOL(is_XPERLSPACE_high(locinput))))
4409        {
4410         sayNO;
4411        }
4412        break;
4413       case _CC_ENUM_BLANK:
4414        if (! (to_complement
4415            ^ cBOOL(is_HORIZWS_high(locinput))))
4416        {
4417         sayNO;
4418        }
4419        break;
4420       case _CC_ENUM_XDIGIT:
4421        if (! (to_complement
4422            ^ cBOOL(is_XDIGIT_high(locinput))))
4423        {
4424         sayNO;
4425        }
4426        break;
4427       case _CC_ENUM_VERTSPACE:
4428        if (! (to_complement
4429            ^ cBOOL(is_VERTWS_high(locinput))))
4430        {
4431         sayNO;
4432        }
4433        break;
4434       default:    /* The rest, e.g. [:cntrl:], can't match
4435          above Latin1 */
4436        if (! to_complement) {
4437         sayNO;
4438        }
4439        break;
4440      }
4441     }
4442     locinput += UTF8SKIP(locinput);
4443    }
4444    break;
4445
4446   case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4447      a Unicode extended Grapheme Cluster */
4448    /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4449    extended Grapheme Cluster is:
4450
4451    CR LF
4452    | Prepend* Begin Extend*
4453    | .
4454
4455    Begin is:           ( Special_Begin | ! Control )
4456    Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4457    Extend is:          ( Grapheme_Extend | Spacing_Mark )
4458    Control is:         [ GCB_Control | CR | LF ]
4459    Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4460
4461    If we create a 'Regular_Begin' = Begin - Special_Begin, then
4462    we can rewrite
4463
4464     Begin is ( Regular_Begin + Special Begin )
4465
4466    It turns out that 98.4% of all Unicode code points match
4467    Regular_Begin.  Doing it this way eliminates a table match in
4468    the previous implementation for almost all Unicode code points.
4469
4470    There is a subtlety with Prepend* which showed up in testing.
4471    Note that the Begin, and only the Begin is required in:
4472     | Prepend* Begin Extend*
4473    Also, Begin contains '! Control'.  A Prepend must be a
4474    '!  Control', which means it must also be a Begin.  What it
4475    comes down to is that if we match Prepend* and then find no
4476    suitable Begin afterwards, that if we backtrack the last
4477    Prepend, that one will be a suitable Begin.
4478    */
4479
4480    if (NEXTCHR_IS_EOS)
4481     sayNO;
4482    if  (! utf8_target) {
4483
4484     /* Match either CR LF  or '.', as all the other possibilities
4485     * require utf8 */
4486     locinput++;     /* Match the . or CR */
4487     if (nextchr == '\r' /* And if it was CR, and the next is LF,
4488          match the LF */
4489      && locinput < reginfo->strend
4490      && UCHARAT(locinput) == '\n')
4491     {
4492      locinput++;
4493     }
4494    }
4495    else {
4496
4497     /* Utf8: See if is ( CR LF ); already know that locinput <
4498     * reginfo->strend, so locinput+1 is in bounds */
4499     if ( nextchr == '\r' && locinput+1 < reginfo->strend
4500      && UCHARAT(locinput + 1) == '\n')
4501     {
4502      locinput += 2;
4503     }
4504     else {
4505      STRLEN len;
4506
4507      /* In case have to backtrack to beginning, then match '.' */
4508      char *starting = locinput;
4509
4510      /* In case have to backtrack the last prepend */
4511      char *previous_prepend = NULL;
4512
4513      LOAD_UTF8_CHARCLASS_GCB();
4514
4515      /* Match (prepend)*   */
4516      while (locinput < reginfo->strend
4517       && (len = is_GCB_Prepend_utf8(locinput)))
4518      {
4519       previous_prepend = locinput;
4520       locinput += len;
4521      }
4522
4523      /* As noted above, if we matched a prepend character, but
4524      * the next thing won't match, back off the last prepend we
4525      * matched, as it is guaranteed to match the begin */
4526      if (previous_prepend
4527       && (locinput >=  reginfo->strend
4528        || (! swash_fetch(PL_utf8_X_regular_begin,
4529            (U8*)locinput, utf8_target)
4530         && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4531       )
4532      {
4533       locinput = previous_prepend;
4534      }
4535
4536      /* Note that here we know reginfo->strend > locinput, as we
4537      * tested that upon input to this switch case, and if we
4538      * moved locinput forward, we tested the result just above
4539      * and it either passed, or we backed off so that it will
4540      * now pass */
4541      if (swash_fetch(PL_utf8_X_regular_begin,
4542          (U8*)locinput, utf8_target)) {
4543       locinput += UTF8SKIP(locinput);
4544      }
4545      else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4546
4547       /* Here did not match the required 'Begin' in the
4548       * second term.  So just match the very first
4549       * character, the '.' of the final term of the regex */
4550       locinput = starting + UTF8SKIP(starting);
4551       goto exit_utf8;
4552      } else {
4553
4554       /* Here is a special begin.  It can be composed of
4555       * several individual characters.  One possibility is
4556       * RI+ */
4557       if ((len = is_GCB_RI_utf8(locinput))) {
4558        locinput += len;
4559        while (locinput < reginfo->strend
4560         && (len = is_GCB_RI_utf8(locinput)))
4561        {
4562         locinput += len;
4563        }
4564       } else if ((len = is_GCB_T_utf8(locinput))) {
4565        /* Another possibility is T+ */
4566        locinput += len;
4567        while (locinput < reginfo->strend
4568         && (len = is_GCB_T_utf8(locinput)))
4569        {
4570         locinput += len;
4571        }
4572       } else {
4573
4574        /* Here, neither RI+ nor T+; must be some other
4575        * Hangul.  That means it is one of the others: L,
4576        * LV, LVT or V, and matches:
4577        * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4578
4579        /* Match L*           */
4580        while (locinput < reginfo->strend
4581         && (len = is_GCB_L_utf8(locinput)))
4582        {
4583         locinput += len;
4584        }
4585
4586        /* Here, have exhausted L*.  If the next character
4587        * is not an LV, LVT nor V, it means we had to have
4588        * at least one L, so matches L+ in the original
4589        * equation, we have a complete hangul syllable.
4590        * Are done. */
4591
4592        if (locinput < reginfo->strend
4593         && is_GCB_LV_LVT_V_utf8(locinput))
4594        {
4595         /* Otherwise keep going.  Must be LV, LVT or V.
4596         * See if LVT, by first ruling out V, then LV */
4597         if (! is_GCB_V_utf8(locinput)
4598           /* All but every TCount one is LV */
4599          && (valid_utf8_to_uvchr((U8 *) locinput,
4600                   NULL)
4601                   - SBASE)
4602           % TCount != 0)
4603         {
4604          locinput += UTF8SKIP(locinput);
4605         } else {
4606
4607          /* Must be  V or LV.  Take it, then match
4608          * V*     */
4609          locinput += UTF8SKIP(locinput);
4610          while (locinput < reginfo->strend
4611           && (len = is_GCB_V_utf8(locinput)))
4612          {
4613           locinput += len;
4614          }
4615         }
4616
4617         /* And any of LV, LVT, or V can be followed
4618         * by T*            */
4619         while (locinput < reginfo->strend
4620          && (len = is_GCB_T_utf8(locinput)))
4621         {
4622          locinput += len;
4623         }
4624        }
4625       }
4626      }
4627
4628      /* Match any extender */
4629      while (locinput < reginfo->strend
4630        && swash_fetch(PL_utf8_X_extend,
4631            (U8*)locinput, utf8_target))
4632      {
4633       locinput += UTF8SKIP(locinput);
4634      }
4635     }
4636    exit_utf8:
4637     if (locinput > reginfo->strend) sayNO;
4638    }
4639    break;
4640
4641   case NREFFL:  /*  /\g{name}/il  */
4642   {   /* The capture buffer cases.  The ones beginning with N for the
4643    named buffers just convert to the equivalent numbered and
4644    pretend they were called as the corresponding numbered buffer
4645    op.  */
4646    /* don't initialize these in the declaration, it makes C++
4647    unhappy */
4648    const char *s;
4649    char type;
4650    re_fold_t folder;
4651    const U8 *fold_array;
4652    UV utf8_fold_flags;
4653
4654    RX_MATCH_TAINTED_on(reginfo->prog);
4655    folder = foldEQ_locale;
4656    fold_array = PL_fold_locale;
4657    type = REFFL;
4658    utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4659    goto do_nref;
4660
4661   case NREFFA:  /*  /\g{name}/iaa  */
4662    folder = foldEQ_latin1;
4663    fold_array = PL_fold_latin1;
4664    type = REFFA;
4665    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4666    goto do_nref;
4667
4668   case NREFFU:  /*  /\g{name}/iu  */
4669    folder = foldEQ_latin1;
4670    fold_array = PL_fold_latin1;
4671    type = REFFU;
4672    utf8_fold_flags = 0;
4673    goto do_nref;
4674
4675   case NREFF:  /*  /\g{name}/i  */
4676    folder = foldEQ;
4677    fold_array = PL_fold;
4678    type = REFF;
4679    utf8_fold_flags = 0;
4680    goto do_nref;
4681
4682   case NREF:  /*  /\g{name}/   */
4683    type = REF;
4684    folder = NULL;
4685    fold_array = NULL;
4686    utf8_fold_flags = 0;
4687   do_nref:
4688
4689    /* For the named back references, find the corresponding buffer
4690    * number */
4691    n = reg_check_named_buff_matched(rex,scan);
4692
4693    if ( ! n ) {
4694     sayNO;
4695    }
4696    goto do_nref_ref_common;
4697
4698   case REFFL:  /*  /\1/il  */
4699    RX_MATCH_TAINTED_on(reginfo->prog);
4700    folder = foldEQ_locale;
4701    fold_array = PL_fold_locale;
4702    utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4703    goto do_ref;
4704
4705   case REFFA:  /*  /\1/iaa  */
4706    folder = foldEQ_latin1;
4707    fold_array = PL_fold_latin1;
4708    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4709    goto do_ref;
4710
4711   case REFFU:  /*  /\1/iu  */
4712    folder = foldEQ_latin1;
4713    fold_array = PL_fold_latin1;
4714    utf8_fold_flags = 0;
4715    goto do_ref;
4716
4717   case REFF:  /*  /\1/i  */
4718    folder = foldEQ;
4719    fold_array = PL_fold;
4720    utf8_fold_flags = 0;
4721    goto do_ref;
4722
4723   case REF:  /*  /\1/    */
4724    folder = NULL;
4725    fold_array = NULL;
4726    utf8_fold_flags = 0;
4727
4728   do_ref:
4729    type = OP(scan);
4730    n = ARG(scan);  /* which paren pair */
4731
4732   do_nref_ref_common:
4733    ln = rex->offs[n].start;
4734    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
4735    if (rex->lastparen < n || ln == -1)
4736     sayNO;   /* Do not match unless seen CLOSEn. */
4737    if (ln == rex->offs[n].end)
4738     break;
4739
4740    s = reginfo->strbeg + ln;
4741    if (type != REF /* REF can do byte comparison */
4742     && (utf8_target || type == REFFU))
4743    { /* XXX handle REFFL better */
4744     char * limit = reginfo->strend;
4745
4746     /* This call case insensitively compares the entire buffer
4747      * at s, with the current input starting at locinput, but
4748      * not going off the end given by reginfo->strend, and
4749      * returns in <limit> upon success, how much of the
4750      * current input was matched */
4751     if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4752          locinput, &limit, 0, utf8_target, utf8_fold_flags))
4753     {
4754      sayNO;
4755     }
4756     locinput = limit;
4757     break;
4758    }
4759
4760    /* Not utf8:  Inline the first character, for speed. */
4761    if (!NEXTCHR_IS_EOS &&
4762     UCHARAT(s) != nextchr &&
4763     (type == REF ||
4764     UCHARAT(s) != fold_array[nextchr]))
4765     sayNO;
4766    ln = rex->offs[n].end - ln;
4767    if (locinput + ln > reginfo->strend)
4768     sayNO;
4769    if (ln > 1 && (type == REF
4770       ? memNE(s, locinput, ln)
4771       : ! folder(s, locinput, ln)))
4772     sayNO;
4773    locinput += ln;
4774    break;
4775   }
4776
4777   case NOTHING: /* null op; e.g. the 'nothing' following
4778      * the '*' in m{(a+|b)*}' */
4779    break;
4780   case TAIL: /* placeholder while compiling (A|B|C) */
4781    break;
4782
4783   case BACK: /* ??? doesn't appear to be used ??? */
4784    break;
4785
4786 #undef  ST
4787 #define ST st->u.eval
4788   {
4789    SV *ret;
4790    REGEXP *re_sv;
4791    regexp *re;
4792    regexp_internal *rei;
4793    regnode *startpoint;
4794
4795   case GOSTART: /*  (?R)  */
4796   case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
4797    if (cur_eval && cur_eval->locinput==locinput) {
4798     if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4799      Perl_croak(aTHX_ "Infinite recursion in regex");
4800     if ( ++nochange_depth > max_nochange_depth )
4801      Perl_croak(aTHX_
4802       "Pattern subroutine nesting without pos change"
4803       " exceeded limit in regex");
4804    } else {
4805     nochange_depth = 0;
4806    }
4807    re_sv = rex_sv;
4808    re = rex;
4809    rei = rexi;
4810    if (OP(scan)==GOSUB) {
4811     startpoint = scan + ARG2L(scan);
4812     ST.close_paren = ARG(scan);
4813    } else {
4814     startpoint = rei->program+1;
4815     ST.close_paren = 0;
4816    }
4817    goto eval_recurse_doit;
4818    assert(0); /* NOTREACHED */
4819
4820   case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
4821    if (cur_eval && cur_eval->locinput==locinput) {
4822     if ( ++nochange_depth > max_nochange_depth )
4823      Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4824    } else {
4825     nochange_depth = 0;
4826    }
4827    {
4828     /* execute the code in the {...} */
4829
4830     dSP;
4831     IV before;
4832     OP * const oop = PL_op;
4833     COP * const ocurcop = PL_curcop;
4834     OP *nop;
4835     CV *newcv;
4836
4837     /* save *all* paren positions */
4838     regcppush(rex, 0, maxopenparen);
4839     REGCP_SET(runops_cp);
4840
4841     if (!caller_cv)
4842      caller_cv = find_runcv(NULL);
4843
4844     n = ARG(scan);
4845
4846     if (rexi->data->what[n] == 'r') { /* code from an external qr */
4847      newcv = (ReANY(
4848             (REGEXP*)(rexi->data->data[n])
4849            ))->qr_anoncv
4850           ;
4851      nop = (OP*)rexi->data->data[n+1];
4852     }
4853     else if (rexi->data->what[n] == 'l') { /* literal code */
4854      newcv = caller_cv;
4855      nop = (OP*)rexi->data->data[n];
4856      assert(CvDEPTH(newcv));
4857     }
4858     else {
4859      /* literal with own CV */
4860      assert(rexi->data->what[n] == 'L');
4861      newcv = rex->qr_anoncv;
4862      nop = (OP*)rexi->data->data[n];
4863     }
4864
4865     /* normally if we're about to execute code from the same
4866     * CV that we used previously, we just use the existing
4867     * CX stack entry. However, its possible that in the
4868     * meantime we may have backtracked, popped from the save
4869     * stack, and undone the SAVECOMPPAD(s) associated with
4870     * PUSH_MULTICALL; in which case PL_comppad no longer
4871     * points to newcv's pad. */
4872     if (newcv != last_pushed_cv || PL_comppad != last_pad)
4873     {
4874      U8 flags = (CXp_SUB_RE |
4875         ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
4876      if (last_pushed_cv) {
4877       CHANGE_MULTICALL_FLAGS(newcv, flags);
4878      }
4879      else {
4880       PUSH_MULTICALL_FLAGS(newcv, flags);
4881      }
4882      last_pushed_cv = newcv;
4883     }
4884     else {
4885      /* these assignments are just to silence compiler
4886      * warnings */
4887      multicall_cop = NULL;
4888      newsp = NULL;
4889     }
4890     last_pad = PL_comppad;
4891
4892     /* the initial nextstate you would normally execute
4893     * at the start of an eval (which would cause error
4894     * messages to come from the eval), may be optimised
4895     * away from the execution path in the regex code blocks;
4896     * so manually set PL_curcop to it initially */
4897     {
4898      OP *o = cUNOPx(nop)->op_first;
4899      assert(o->op_type == OP_NULL);
4900      if (o->op_targ == OP_SCOPE) {
4901       o = cUNOPo->op_first;
4902      }
4903      else {
4904       assert(o->op_targ == OP_LEAVE);
4905       o = cUNOPo->op_first;
4906       assert(o->op_type == OP_ENTER);
4907       o = o->op_sibling;
4908      }
4909
4910      if (o->op_type != OP_STUB) {
4911       assert(    o->op_type == OP_NEXTSTATE
4912         || o->op_type == OP_DBSTATE
4913         || (o->op_type == OP_NULL
4914          &&  (  o->op_targ == OP_NEXTSTATE
4915           || o->op_targ == OP_DBSTATE
4916           )
4917          )
4918       );
4919       PL_curcop = (COP*)o;
4920      }
4921     }
4922     nop = nop->op_next;
4923
4924     DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4925      "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4926
4927     rex->offs[0].end = locinput - reginfo->strbeg;
4928     if (reginfo->info_aux_eval->pos_magic)
4929       reginfo->info_aux_eval->pos_magic->mg_len
4930           = locinput - reginfo->strbeg;
4931
4932     if (sv_yes_mark) {
4933      SV *sv_mrk = get_sv("REGMARK", 1);
4934      sv_setsv(sv_mrk, sv_yes_mark);
4935     }
4936
4937     /* we don't use MULTICALL here as we want to call the
4938     * first op of the block of interest, rather than the
4939     * first op of the sub */
4940     before = (IV)(SP-PL_stack_base);
4941     PL_op = nop;
4942     CALLRUNOPS(aTHX);   /* Scalar context. */
4943     SPAGAIN;
4944     if ((IV)(SP-PL_stack_base) == before)
4945      ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
4946     else {
4947      ret = POPs;
4948      PUTBACK;
4949     }
4950
4951     /* before restoring everything, evaluate the returned
4952     * value, so that 'uninit' warnings don't use the wrong
4953     * PL_op or pad. Also need to process any magic vars
4954     * (e.g. $1) *before* parentheses are restored */
4955
4956     PL_op = NULL;
4957
4958     re_sv = NULL;
4959     if (logical == 0)        /*   (?{})/   */
4960      sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4961     else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
4962      sw = cBOOL(SvTRUE(ret));
4963      logical = 0;
4964     }
4965     else {                   /*  /(??{})  */
4966      /*  if its overloaded, let the regex compiler handle
4967      *  it; otherwise extract regex, or stringify  */
4968      if (!SvAMAGIC(ret)) {
4969       SV *sv = ret;
4970       if (SvROK(sv))
4971        sv = SvRV(sv);
4972       if (SvTYPE(sv) == SVt_REGEXP)
4973        re_sv = (REGEXP*) sv;
4974       else if (SvSMAGICAL(sv)) {
4975        MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4976        if (mg)
4977         re_sv = (REGEXP *) mg->mg_obj;
4978       }
4979
4980       /* force any magic, undef warnings here */
4981       if (!re_sv) {
4982        ret = sv_mortalcopy(ret);
4983        (void) SvPV_force_nolen(ret);
4984       }
4985      }
4986
4987     }
4988
4989     /* *** Note that at this point we don't restore
4990     * PL_comppad, (or pop the CxSUB) on the assumption it may
4991     * be used again soon. This is safe as long as nothing
4992     * in the regexp code uses the pad ! */
4993     PL_op = oop;
4994     PL_curcop = ocurcop;
4995     S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
4996
4997     if (logical != 2)
4998      break;
4999    }
5000
5001     /* only /(??{})/  from now on */
5002     logical = 0;
5003     {
5004      /* extract RE object from returned value; compiling if
5005      * necessary */
5006
5007      if (re_sv) {
5008       re_sv = reg_temp_copy(NULL, re_sv);
5009      }
5010      else {
5011       U32 pm_flags = 0;
5012
5013       if (SvUTF8(ret) && IN_BYTES) {
5014        /* In use 'bytes': make a copy of the octet
5015        * sequence, but without the flag on */
5016        STRLEN len;
5017        const char *const p = SvPV(ret, len);
5018        ret = newSVpvn_flags(p, len, SVs_TEMP);
5019       }
5020       if (rex->intflags & PREGf_USE_RE_EVAL)
5021        pm_flags |= PMf_USE_RE_EVAL;
5022
5023       /* if we got here, it should be an engine which
5024       * supports compiling code blocks and stuff */
5025       assert(rex->engine && rex->engine->op_comp);
5026       assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5027       re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5028          rex->engine, NULL, NULL,
5029          /* copy /msix etc to inner pattern */
5030          scan->flags,
5031          pm_flags);
5032
5033       if (!(SvFLAGS(ret)
5034        & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5035         | SVs_GMG))) {
5036        /* This isn't a first class regexp. Instead, it's
5037        caching a regexp onto an existing, Perl visible
5038        scalar.  */
5039        sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5040       }
5041       /* safe to do now that any $1 etc has been
5042       * interpolated into the new pattern string and
5043       * compiled */
5044       S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5045      }
5046      SAVEFREESV(re_sv);
5047      re = ReANY(re_sv);
5048     }
5049     RXp_MATCH_COPIED_off(re);
5050     re->subbeg = rex->subbeg;
5051     re->sublen = rex->sublen;
5052     re->suboffset = rex->suboffset;
5053     re->subcoffset = rex->subcoffset;
5054     rei = RXi_GET(re);
5055     DEBUG_EXECUTE_r(
5056      debug_start_match(re_sv, utf8_target, locinput,
5057          reginfo->strend, "Matching embedded");
5058     );
5059     startpoint = rei->program + 1;
5060      ST.close_paren = 0; /* only used for GOSUB */
5061
5062   eval_recurse_doit: /* Share code with GOSUB below this line */
5063     /* run the pattern returned from (??{...}) */
5064
5065     /* Save *all* the positions. */
5066     ST.cp = regcppush(rex, 0, maxopenparen);
5067     REGCP_SET(ST.lastcp);
5068
5069     re->lastparen = 0;
5070     re->lastcloseparen = 0;
5071
5072     maxopenparen = 0;
5073
5074     /* invalidate the S-L poscache. We're now executing a
5075     * different set of WHILEM ops (and their associated
5076     * indexes) against the same string, so the bits in the
5077     * cache are meaningless. Setting maxiter to zero forces
5078     * the cache to be invalidated and zeroed before reuse.
5079     * XXX This is too dramatic a measure. Ideally we should
5080     * save the old cache and restore when running the outer
5081     * pattern again */
5082     reginfo->poscache_maxiter = 0;
5083
5084     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5085
5086     ST.prev_rex = rex_sv;
5087     ST.prev_curlyx = cur_curlyx;
5088     rex_sv = re_sv;
5089     SET_reg_curpm(rex_sv);
5090     rex = re;
5091     rexi = rei;
5092     cur_curlyx = NULL;
5093     ST.B = next;
5094     ST.prev_eval = cur_eval;
5095     cur_eval = st;
5096     /* now continue from first node in postoned RE */
5097     PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5098     assert(0); /* NOTREACHED */
5099   }
5100
5101   case EVAL_AB: /* cleanup after a successful (??{A})B */
5102    /* note: this is called twice; first after popping B, then A */
5103    rex_sv = ST.prev_rex;
5104    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5105    SET_reg_curpm(rex_sv);
5106    rex = ReANY(rex_sv);
5107    rexi = RXi_GET(rex);
5108    regcpblow(ST.cp);
5109    cur_eval = ST.prev_eval;
5110    cur_curlyx = ST.prev_curlyx;
5111
5112    /* Invalidate cache. See "invalidate" comment above. */
5113    reginfo->poscache_maxiter = 0;
5114    if ( nochange_depth )
5115     nochange_depth--;
5116    sayYES;
5117
5118
5119   case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5120    /* note: this is called twice; first after popping B, then A */
5121    rex_sv = ST.prev_rex;
5122    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5123    SET_reg_curpm(rex_sv);
5124    rex = ReANY(rex_sv);
5125    rexi = RXi_GET(rex);
5126
5127    REGCP_UNWIND(ST.lastcp);
5128    regcppop(rex, &maxopenparen);
5129    cur_eval = ST.prev_eval;
5130    cur_curlyx = ST.prev_curlyx;
5131    /* Invalidate cache. See "invalidate" comment above. */
5132    reginfo->poscache_maxiter = 0;
5133    if ( nochange_depth )
5134     nochange_depth--;
5135    sayNO_SILENT;
5136 #undef ST
5137
5138   case OPEN: /*  (  */
5139    n = ARG(scan);  /* which paren pair */
5140    rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5141    if (n > maxopenparen)
5142     maxopenparen = n;
5143    DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5144     "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5145     PTR2UV(rex),
5146     PTR2UV(rex->offs),
5147     (UV)n,
5148     (IV)rex->offs[n].start_tmp,
5149     (UV)maxopenparen
5150    ));
5151    lastopen = n;
5152    break;
5153
5154 /* XXX really need to log other places start/end are set too */
5155 #define CLOSE_CAPTURE \
5156  rex->offs[n].start = rex->offs[n].start_tmp; \
5157  rex->offs[n].end = locinput - reginfo->strbeg; \
5158  DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5159   "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5160   PTR2UV(rex), \
5161   PTR2UV(rex->offs), \
5162   (UV)n, \
5163   (IV)rex->offs[n].start, \
5164   (IV)rex->offs[n].end \
5165  ))
5166
5167   case CLOSE:  /*  )  */
5168    n = ARG(scan);  /* which paren pair */
5169    CLOSE_CAPTURE;
5170    if (n > rex->lastparen)
5171     rex->lastparen = n;
5172    rex->lastcloseparen = n;
5173    if (cur_eval && cur_eval->u.eval.close_paren == n) {
5174     goto fake_end;
5175    }
5176    break;
5177
5178   case ACCEPT:  /*  (*ACCEPT)  */
5179    if (ARG(scan)){
5180     regnode *cursor;
5181     for (cursor=scan;
5182      cursor && OP(cursor)!=END;
5183      cursor=regnext(cursor))
5184     {
5185      if ( OP(cursor)==CLOSE ){
5186       n = ARG(cursor);
5187       if ( n <= lastopen ) {
5188        CLOSE_CAPTURE;
5189        if (n > rex->lastparen)
5190         rex->lastparen = n;
5191        rex->lastcloseparen = n;
5192        if ( n == ARG(scan) || (cur_eval &&
5193         cur_eval->u.eval.close_paren == n))
5194         break;
5195       }
5196      }
5197     }
5198    }
5199    goto fake_end;
5200    /*NOTREACHED*/
5201
5202   case GROUPP:  /*  (?(1))  */
5203    n = ARG(scan);  /* which paren pair */
5204    sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5205    break;
5206
5207   case NGROUPP:  /*  (?(<name>))  */
5208    /* reg_check_named_buff_matched returns 0 for no match */
5209    sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5210    break;
5211
5212   case INSUBP:   /*  (?(R))  */
5213    n = ARG(scan);
5214    sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5215    break;
5216
5217   case DEFINEP:  /*  (?(DEFINE))  */
5218    sw = 0;
5219    break;
5220
5221   case IFTHEN:   /*  (?(cond)A|B)  */
5222    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5223    if (sw)
5224     next = NEXTOPER(NEXTOPER(scan));
5225    else {
5226     next = scan + ARG(scan);
5227     if (OP(next) == IFTHEN) /* Fake one. */
5228      next = NEXTOPER(NEXTOPER(next));
5229    }
5230    break;
5231
5232   case LOGICAL:  /* modifier for EVAL and IFMATCH */
5233    logical = scan->flags;
5234    break;
5235
5236 /*******************************************************************
5237
5238 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5239 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5240 STAR/PLUS/CURLY/CURLYN are used instead.)
5241
5242 A*B is compiled as <CURLYX><A><WHILEM><B>
5243
5244 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5245 state, which contains the current count, initialised to -1. It also sets
5246 cur_curlyx to point to this state, with any previous value saved in the
5247 state block.
5248
5249 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5250 since the pattern may possibly match zero times (i.e. it's a while {} loop
5251 rather than a do {} while loop).
5252
5253 Each entry to WHILEM represents a successful match of A. The count in the
5254 CURLYX block is incremented, another WHILEM state is pushed, and execution
5255 passes to A or B depending on greediness and the current count.
5256
5257 For example, if matching against the string a1a2a3b (where the aN are
5258 substrings that match /A/), then the match progresses as follows: (the
5259 pushed states are interspersed with the bits of strings matched so far):
5260
5261  <CURLYX cnt=-1>
5262  <CURLYX cnt=0><WHILEM>
5263  <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5264  <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5265  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5266  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5267
5268 (Contrast this with something like CURLYM, which maintains only a single
5269 backtrack state:
5270
5271  <CURLYM cnt=0> a1
5272  a1 <CURLYM cnt=1> a2
5273  a1 a2 <CURLYM cnt=2> a3
5274  a1 a2 a3 <CURLYM cnt=3> b
5275 )
5276
5277 Each WHILEM state block marks a point to backtrack to upon partial failure
5278 of A or B, and also contains some minor state data related to that
5279 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5280 overall state, such as the count, and pointers to the A and B ops.
5281
5282 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5283 must always point to the *current* CURLYX block, the rules are:
5284
5285 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5286 and set cur_curlyx to point the new block.
5287
5288 When popping the CURLYX block after a successful or unsuccessful match,
5289 restore the previous cur_curlyx.
5290
5291 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5292 to the outer one saved in the CURLYX block.
5293
5294 When popping the WHILEM block after a successful or unsuccessful B match,
5295 restore the previous cur_curlyx.
5296
5297 Here's an example for the pattern (AI* BI)*BO
5298 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5299
5300 cur_
5301 curlyx backtrack stack
5302 ------ ---------------
5303 NULL
5304 CO     <CO prev=NULL> <WO>
5305 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5306 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5307 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5308
5309 At this point the pattern succeeds, and we work back down the stack to
5310 clean up, restoring as we go:
5311
5312 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5313 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5314 CO     <CO prev=NULL> <WO>
5315 NULL
5316
5317 *******************************************************************/
5318
5319 #define ST st->u.curlyx
5320
5321   case CURLYX:    /* start of /A*B/  (for complex A) */
5322   {
5323    /* No need to save/restore up to this paren */
5324    I32 parenfloor = scan->flags;
5325
5326    assert(next); /* keep Coverity happy */
5327    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5328     next += ARG(next);
5329
5330    /* XXXX Probably it is better to teach regpush to support
5331    parenfloor > maxopenparen ... */
5332    if (parenfloor > (I32)rex->lastparen)
5333     parenfloor = rex->lastparen; /* Pessimization... */
5334
5335    ST.prev_curlyx= cur_curlyx;
5336    cur_curlyx = st;
5337    ST.cp = PL_savestack_ix;
5338
5339    /* these fields contain the state of the current curly.
5340    * they are accessed by subsequent WHILEMs */
5341    ST.parenfloor = parenfloor;
5342    ST.me = scan;
5343    ST.B = next;
5344    ST.minmod = minmod;
5345    minmod = 0;
5346    ST.count = -1; /* this will be updated by WHILEM */
5347    ST.lastloc = NULL;  /* this will be updated by WHILEM */
5348
5349    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5350    assert(0); /* NOTREACHED */
5351   }
5352
5353   case CURLYX_end: /* just finished matching all of A*B */
5354    cur_curlyx = ST.prev_curlyx;
5355    sayYES;
5356    assert(0); /* NOTREACHED */
5357
5358   case CURLYX_end_fail: /* just failed to match all of A*B */
5359    regcpblow(ST.cp);
5360    cur_curlyx = ST.prev_curlyx;
5361    sayNO;
5362    assert(0); /* NOTREACHED */
5363
5364
5365 #undef ST
5366 #define ST st->u.whilem
5367
5368   case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5369   {
5370    /* see the discussion above about CURLYX/WHILEM */
5371    I32 n;
5372    int min = ARG1(cur_curlyx->u.curlyx.me);
5373    int max = ARG2(cur_curlyx->u.curlyx.me);
5374    regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5375
5376    assert(cur_curlyx); /* keep Coverity happy */
5377    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5378    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5379    ST.cache_offset = 0;
5380    ST.cache_mask = 0;
5381
5382
5383    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5384     "%*s  whilem: matched %ld out of %d..%d\n",
5385     REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5386    );
5387
5388    /* First just match a string of min A's. */
5389
5390    if (n < min) {
5391     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5392          maxopenparen);
5393     cur_curlyx->u.curlyx.lastloc = locinput;
5394     REGCP_SET(ST.lastcp);
5395
5396     PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5397     assert(0); /* NOTREACHED */
5398    }
5399
5400    /* If degenerate A matches "", assume A done. */
5401
5402    if (locinput == cur_curlyx->u.curlyx.lastloc) {
5403     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5404     "%*s  whilem: empty match detected, trying continuation...\n",
5405     REPORT_CODE_OFF+depth*2, "")
5406     );
5407     goto do_whilem_B_max;
5408    }
5409
5410    /* super-linear cache processing.
5411    *
5412    * The idea here is that for certain types of CURLYX/WHILEM -
5413    * principally those whose upper bound is infinity (and
5414    * excluding regexes that have things like \1 and other very
5415    * non-regular expresssiony things), then if a pattern like
5416    * /....A*.../ fails and we backtrack to the WHILEM, then we
5417    * make a note that this particular WHILEM op was at string
5418    * position 47 (say) when the rest of pattern failed. Then, if
5419    * we ever find ourselves back at that WHILEM, and at string
5420    * position 47 again, we can just fail immediately rather than
5421    * running the rest of the pattern again.
5422    *
5423    * This is very handy when patterns start to go
5424    * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5425    * with a combinatorial explosion of backtracking.
5426    *
5427    * The cache is implemented as a bit array, with one bit per
5428    * string byte position per WHILEM op (up to 16) - so its
5429    * between 0.25 and 2x the string size.
5430    *
5431    * To avoid allocating a poscache buffer every time, we do an
5432    * initially countdown; only after we have  executed a WHILEM
5433    * op (string-length x #WHILEMs) times do we allocate the
5434    * cache.
5435    *
5436    * The top 4 bits of scan->flags byte say how many different
5437    * relevant CURLLYX/WHILEM op pairs there are, while the
5438    * bottom 4-bits is the identifying index number of this
5439    * WHILEM.
5440    */
5441
5442    if (scan->flags) {
5443
5444     if (!reginfo->poscache_maxiter) {
5445      /* start the countdown: Postpone detection until we
5446      * know the match is not *that* much linear. */
5447      reginfo->poscache_maxiter
5448       =    (reginfo->strend - reginfo->strbeg + 1)
5449       * (scan->flags>>4);
5450      /* possible overflow for long strings and many CURLYX's */
5451      if (reginfo->poscache_maxiter < 0)
5452       reginfo->poscache_maxiter = I32_MAX;
5453      reginfo->poscache_iter = reginfo->poscache_maxiter;
5454     }
5455
5456     if (reginfo->poscache_iter-- == 0) {
5457      /* initialise cache */
5458      const I32 size = (reginfo->poscache_maxiter + 7)/8;
5459      regmatch_info_aux *const aux = reginfo->info_aux;
5460      if (aux->poscache) {
5461       if ((I32)reginfo->poscache_size < size) {
5462        Renew(aux->poscache, size, char);
5463        reginfo->poscache_size = size;
5464       }
5465       Zero(aux->poscache, size, char);
5466      }
5467      else {
5468       reginfo->poscache_size = size;
5469       Newxz(aux->poscache, size, char);
5470      }
5471      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5472  "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5473        PL_colors[4], PL_colors[5])
5474      );
5475     }
5476
5477     if (reginfo->poscache_iter < 0) {
5478      /* have we already failed at this position? */
5479      I32 offset, mask;
5480
5481      reginfo->poscache_iter = -1; /* stop eventual underflow */
5482      offset  = (scan->flags & 0xf) - 1
5483         +   (locinput - reginfo->strbeg)
5484         * (scan->flags>>4);
5485      mask    = 1 << (offset % 8);
5486      offset /= 8;
5487      if (reginfo->info_aux->poscache[offset] & mask) {
5488       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5489        "%*s  whilem: (cache) already tried at this position...\n",
5490        REPORT_CODE_OFF+depth*2, "")
5491       );
5492       sayNO; /* cache records failure */
5493      }
5494      ST.cache_offset = offset;
5495      ST.cache_mask   = mask;
5496     }
5497    }
5498
5499    /* Prefer B over A for minimal matching. */
5500
5501    if (cur_curlyx->u.curlyx.minmod) {
5502     ST.save_curlyx = cur_curlyx;
5503     cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5504     ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5505        maxopenparen);
5506     REGCP_SET(ST.lastcp);
5507     PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5508          locinput);
5509     assert(0); /* NOTREACHED */
5510    }
5511
5512    /* Prefer A over B for maximal matching. */
5513
5514    if (n < max) { /* More greed allowed? */
5515     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5516        maxopenparen);
5517     cur_curlyx->u.curlyx.lastloc = locinput;
5518     REGCP_SET(ST.lastcp);
5519     PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5520     assert(0); /* NOTREACHED */
5521    }
5522    goto do_whilem_B_max;
5523   }
5524   assert(0); /* NOTREACHED */
5525
5526   case WHILEM_B_min: /* just matched B in a minimal match */
5527   case WHILEM_B_max: /* just matched B in a maximal match */
5528    cur_curlyx = ST.save_curlyx;
5529    sayYES;
5530    assert(0); /* NOTREACHED */
5531
5532   case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5533    cur_curlyx = ST.save_curlyx;
5534    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5535    cur_curlyx->u.curlyx.count--;
5536    CACHEsayNO;
5537    assert(0); /* NOTREACHED */
5538
5539   case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5540    /* FALL THROUGH */
5541   case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5542    REGCP_UNWIND(ST.lastcp);
5543    regcppop(rex, &maxopenparen);
5544    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5545    cur_curlyx->u.curlyx.count--;
5546    CACHEsayNO;
5547    assert(0); /* NOTREACHED */
5548
5549   case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5550    REGCP_UNWIND(ST.lastcp);
5551    regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5552    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5553     "%*s  whilem: failed, trying continuation...\n",
5554     REPORT_CODE_OFF+depth*2, "")
5555    );
5556   do_whilem_B_max:
5557    if (cur_curlyx->u.curlyx.count >= REG_INFTY
5558     && ckWARN(WARN_REGEXP)
5559     && !reginfo->warned)
5560    {
5561     reginfo->warned = TRUE;
5562     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5563      "Complex regular subexpression recursion limit (%d) "
5564      "exceeded",
5565      REG_INFTY - 1);
5566    }
5567
5568    /* now try B */
5569    ST.save_curlyx = cur_curlyx;
5570    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5571    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5572         locinput);
5573    assert(0); /* NOTREACHED */
5574
5575   case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5576    cur_curlyx = ST.save_curlyx;
5577    REGCP_UNWIND(ST.lastcp);
5578    regcppop(rex, &maxopenparen);
5579
5580    if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5581     /* Maximum greed exceeded */
5582     if (cur_curlyx->u.curlyx.count >= REG_INFTY
5583      && ckWARN(WARN_REGEXP)
5584      && !reginfo->warned)
5585     {
5586      reginfo->warned = TRUE;
5587      Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5588       "Complex regular subexpression recursion "
5589       "limit (%d) exceeded",
5590       REG_INFTY - 1);
5591     }
5592     cur_curlyx->u.curlyx.count--;
5593     CACHEsayNO;
5594    }
5595
5596    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5597     "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5598    );
5599    /* Try grabbing another A and see if it helps. */
5600    cur_curlyx->u.curlyx.lastloc = locinput;
5601    ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5602        maxopenparen);
5603    REGCP_SET(ST.lastcp);
5604    PUSH_STATE_GOTO(WHILEM_A_min,
5605     /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5606     locinput);
5607    assert(0); /* NOTREACHED */
5608
5609 #undef  ST
5610 #define ST st->u.branch
5611
5612   case BRANCHJ:     /*  /(...|A|...)/ with long next pointer */
5613    next = scan + ARG(scan);
5614    if (next == scan)
5615     next = NULL;
5616    scan = NEXTOPER(scan);
5617    /* FALL THROUGH */
5618
5619   case BRANCH:     /*  /(...|A|...)/ */
5620    scan = NEXTOPER(scan); /* scan now points to inner node */
5621    ST.lastparen = rex->lastparen;
5622    ST.lastcloseparen = rex->lastcloseparen;
5623    ST.next_branch = next;
5624    REGCP_SET(ST.cp);
5625
5626    /* Now go into the branch */
5627    if (has_cutgroup) {
5628     PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5629    } else {
5630     PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5631    }
5632    assert(0); /* NOTREACHED */
5633
5634   case CUTGROUP:  /*  /(*THEN)/  */
5635    sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5636     MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5637    PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5638    assert(0); /* NOTREACHED */
5639
5640   case CUTGROUP_next_fail:
5641    do_cutgroup = 1;
5642    no_final = 1;
5643    if (st->u.mark.mark_name)
5644     sv_commit = st->u.mark.mark_name;
5645    sayNO;
5646    assert(0); /* NOTREACHED */
5647
5648   case BRANCH_next:
5649    sayYES;
5650    assert(0); /* NOTREACHED */
5651
5652   case BRANCH_next_fail: /* that branch failed; try the next, if any */
5653    if (do_cutgroup) {
5654     do_cutgroup = 0;
5655     no_final = 0;
5656    }
5657    REGCP_UNWIND(ST.cp);
5658    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5659    scan = ST.next_branch;
5660    /* no more branches? */
5661    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5662     DEBUG_EXECUTE_r({
5663      PerlIO_printf( Perl_debug_log,
5664       "%*s  %sBRANCH failed...%s\n",
5665       REPORT_CODE_OFF+depth*2, "",
5666       PL_colors[4],
5667       PL_colors[5] );
5668     });
5669     sayNO_SILENT;
5670    }
5671    continue; /* execute next BRANCH[J] op */
5672    assert(0); /* NOTREACHED */
5673
5674   case MINMOD: /* next op will be non-greedy, e.g. A*?  */
5675    minmod = 1;
5676    break;
5677
5678 #undef  ST
5679 #define ST st->u.curlym
5680
5681   case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5682
5683    /* This is an optimisation of CURLYX that enables us to push
5684    * only a single backtracking state, no matter how many matches
5685    * there are in {m,n}. It relies on the pattern being constant
5686    * length, with no parens to influence future backrefs
5687    */
5688
5689    ST.me = scan;
5690    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5691
5692    ST.lastparen      = rex->lastparen;
5693    ST.lastcloseparen = rex->lastcloseparen;
5694
5695    /* if paren positive, emulate an OPEN/CLOSE around A */
5696    if (ST.me->flags) {
5697     U32 paren = ST.me->flags;
5698     if (paren > maxopenparen)
5699      maxopenparen = paren;
5700     scan += NEXT_OFF(scan); /* Skip former OPEN. */
5701    }
5702    ST.A = scan;
5703    ST.B = next;
5704    ST.alen = 0;
5705    ST.count = 0;
5706    ST.minmod = minmod;
5707    minmod = 0;
5708    ST.c1 = CHRTEST_UNINIT;
5709    REGCP_SET(ST.cp);
5710
5711    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5712     goto curlym_do_B;
5713
5714   curlym_do_A: /* execute the A in /A{m,n}B/  */
5715    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5716    assert(0); /* NOTREACHED */
5717
5718   case CURLYM_A: /* we've just matched an A */
5719    ST.count++;
5720    /* after first match, determine A's length: u.curlym.alen */
5721    if (ST.count == 1) {
5722     if (reginfo->is_utf8_target) {
5723      char *s = st->locinput;
5724      while (s < locinput) {
5725       ST.alen++;
5726       s += UTF8SKIP(s);
5727      }
5728     }
5729     else {
5730      ST.alen = locinput - st->locinput;
5731     }
5732     if (ST.alen == 0)
5733      ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5734    }
5735    DEBUG_EXECUTE_r(
5736     PerlIO_printf(Perl_debug_log,
5737       "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5738       (int)(REPORT_CODE_OFF+(depth*2)), "",
5739       (IV) ST.count, (IV)ST.alen)
5740    );
5741
5742    if (cur_eval && cur_eval->u.eval.close_paren &&
5743     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5744     goto fake_end;
5745
5746    {
5747     I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5748     if ( max == REG_INFTY || ST.count < max )
5749      goto curlym_do_A; /* try to match another A */
5750    }
5751    goto curlym_do_B; /* try to match B */
5752
5753   case CURLYM_A_fail: /* just failed to match an A */
5754    REGCP_UNWIND(ST.cp);
5755
5756    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5757     || (cur_eval && cur_eval->u.eval.close_paren &&
5758      cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5759     sayNO;
5760
5761   curlym_do_B: /* execute the B in /A{m,n}B/  */
5762    if (ST.c1 == CHRTEST_UNINIT) {
5763     /* calculate c1 and c2 for possible match of 1st char
5764     * following curly */
5765     ST.c1 = ST.c2 = CHRTEST_VOID;
5766     if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5767      regnode *text_node = ST.B;
5768      if (! HAS_TEXT(text_node))
5769       FIND_NEXT_IMPT(text_node);
5770      /* this used to be
5771
5772       (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5773
5774        But the former is redundant in light of the latter.
5775
5776        if this changes back then the macro for
5777        IS_TEXT and friends need to change.
5778      */
5779      if (PL_regkind[OP(text_node)] == EXACT) {
5780       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5781       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5782       reginfo))
5783       {
5784        sayNO;
5785       }
5786      }
5787     }
5788    }
5789
5790    DEBUG_EXECUTE_r(
5791     PerlIO_printf(Perl_debug_log,
5792      "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
5793      (int)(REPORT_CODE_OFF+(depth*2)),
5794      "", (IV)ST.count)
5795     );
5796    if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5797     if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5798      if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5799       && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5800      {
5801       /* simulate B failing */
5802       DEBUG_OPTIMISE_r(
5803        PerlIO_printf(Perl_debug_log,
5804         "%*s  CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5805         (int)(REPORT_CODE_OFF+(depth*2)),"",
5806         valid_utf8_to_uvchr((U8 *) locinput, NULL),
5807         valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5808         valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5809       );
5810       state_num = CURLYM_B_fail;
5811       goto reenter_switch;
5812      }
5813     }
5814     else if (nextchr != ST.c1 && nextchr != ST.c2) {
5815      /* simulate B failing */
5816      DEBUG_OPTIMISE_r(
5817       PerlIO_printf(Perl_debug_log,
5818        "%*s  CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5819        (int)(REPORT_CODE_OFF+(depth*2)),"",
5820        (int) nextchr, ST.c1, ST.c2)
5821      );
5822      state_num = CURLYM_B_fail;
5823      goto reenter_switch;
5824     }
5825    }
5826
5827    if (ST.me->flags) {
5828     /* emulate CLOSE: mark current A as captured */
5829     I32 paren = ST.me->flags;
5830     if (ST.count) {
5831      rex->offs[paren].start
5832       = HOPc(locinput, -ST.alen) - reginfo->strbeg;
5833      rex->offs[paren].end = locinput - reginfo->strbeg;
5834      if ((U32)paren > rex->lastparen)
5835       rex->lastparen = paren;
5836      rex->lastcloseparen = paren;
5837     }
5838     else
5839      rex->offs[paren].end = -1;
5840     if (cur_eval && cur_eval->u.eval.close_paren &&
5841      cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5842     {
5843      if (ST.count)
5844       goto fake_end;
5845      else
5846       sayNO;
5847     }
5848    }
5849
5850    PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5851    assert(0); /* NOTREACHED */
5852
5853   case CURLYM_B_fail: /* just failed to match a B */
5854    REGCP_UNWIND(ST.cp);
5855    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5856    if (ST.minmod) {
5857     I32 max = ARG2(ST.me);
5858     if (max != REG_INFTY && ST.count == max)
5859      sayNO;
5860     goto curlym_do_A; /* try to match a further A */
5861    }
5862    /* backtrack one A */
5863    if (ST.count == ARG1(ST.me) /* min */)
5864     sayNO;
5865    ST.count--;
5866    SET_locinput(HOPc(locinput, -ST.alen));
5867    goto curlym_do_B; /* try to match B */
5868
5869 #undef ST
5870 #define ST st->u.curly
5871
5872 #define CURLY_SETPAREN(paren, success) \
5873  if (paren) { \
5874   if (success) { \
5875    rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
5876    rex->offs[paren].end = locinput - reginfo->strbeg; \
5877    if (paren > rex->lastparen) \
5878     rex->lastparen = paren; \
5879    rex->lastcloseparen = paren; \
5880   } \
5881   else { \
5882    rex->offs[paren].end = -1; \
5883    rex->lastparen      = ST.lastparen; \
5884    rex->lastcloseparen = ST.lastcloseparen; \
5885   } \
5886  }
5887
5888   case STAR:  /*  /A*B/ where A is width 1 char */
5889    ST.paren = 0;
5890    ST.min = 0;
5891    ST.max = REG_INFTY;
5892    scan = NEXTOPER(scan);
5893    goto repeat;
5894
5895   case PLUS:  /*  /A+B/ where A is width 1 char */
5896    ST.paren = 0;
5897    ST.min = 1;
5898    ST.max = REG_INFTY;
5899    scan = NEXTOPER(scan);
5900    goto repeat;
5901
5902   case CURLYN:  /*  /(A){m,n}B/ where A is width 1 char */
5903    ST.paren = scan->flags; /* Which paren to set */
5904    ST.lastparen      = rex->lastparen;
5905    ST.lastcloseparen = rex->lastcloseparen;
5906    if (ST.paren > maxopenparen)
5907     maxopenparen = ST.paren;
5908    ST.min = ARG1(scan);  /* min to match */
5909    ST.max = ARG2(scan);  /* max to match */
5910    if (cur_eval && cur_eval->u.eval.close_paren &&
5911     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5912     ST.min=1;
5913     ST.max=1;
5914    }
5915    scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5916    goto repeat;
5917
5918   case CURLY:  /*  /A{m,n}B/ where A is width 1 char */
5919    ST.paren = 0;
5920    ST.min = ARG1(scan);  /* min to match */
5921    ST.max = ARG2(scan);  /* max to match */
5922    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5923   repeat:
5924    /*
5925    * Lookahead to avoid useless match attempts
5926    * when we know what character comes next.
5927    *
5928    * Used to only do .*x and .*?x, but now it allows
5929    * for )'s, ('s and (?{ ... })'s to be in the way
5930    * of the quantifier and the EXACT-like node.  -- japhy
5931    */
5932
5933    assert(ST.min <= ST.max);
5934    if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5935     ST.c1 = ST.c2 = CHRTEST_VOID;
5936    }
5937    else {
5938     regnode *text_node = next;
5939
5940     if (! HAS_TEXT(text_node))
5941      FIND_NEXT_IMPT(text_node);
5942
5943     if (! HAS_TEXT(text_node))
5944      ST.c1 = ST.c2 = CHRTEST_VOID;
5945     else {
5946      if ( PL_regkind[OP(text_node)] != EXACT ) {
5947       ST.c1 = ST.c2 = CHRTEST_VOID;
5948      }
5949      else {
5950
5951      /*  Currently we only get here when
5952
5953       PL_rekind[OP(text_node)] == EXACT
5954
5955       if this changes back then the macro for IS_TEXT and
5956       friends need to change. */
5957       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5958       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
5959       reginfo))
5960       {
5961        sayNO;
5962       }
5963      }
5964     }
5965    }
5966
5967    ST.A = scan;
5968    ST.B = next;
5969    if (minmod) {
5970     char *li = locinput;
5971     minmod = 0;
5972     if (ST.min &&
5973       regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
5974        < ST.min)
5975      sayNO;
5976     SET_locinput(li);
5977     ST.count = ST.min;
5978     REGCP_SET(ST.cp);
5979     if (ST.c1 == CHRTEST_VOID)
5980      goto curly_try_B_min;
5981
5982     ST.oldloc = locinput;
5983
5984     /* set ST.maxpos to the furthest point along the
5985     * string that could possibly match */
5986     if  (ST.max == REG_INFTY) {
5987      ST.maxpos = reginfo->strend - 1;
5988      if (utf8_target)
5989       while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5990        ST.maxpos--;
5991     }
5992     else if (utf8_target) {
5993      int m = ST.max - ST.min;
5994      for (ST.maxpos = locinput;
5995       m >0 && ST.maxpos < reginfo->strend; m--)
5996       ST.maxpos += UTF8SKIP(ST.maxpos);
5997     }
5998     else {
5999      ST.maxpos = locinput + ST.max - ST.min;
6000      if (ST.maxpos >= reginfo->strend)
6001       ST.maxpos = reginfo->strend - 1;
6002     }
6003     goto curly_try_B_min_known;
6004
6005    }
6006    else {
6007     /* avoid taking address of locinput, so it can remain
6008     * a register var */
6009     char *li = locinput;
6010     ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6011     if (ST.count < ST.min)
6012      sayNO;
6013     SET_locinput(li);
6014     if ((ST.count > ST.min)
6015      && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6016     {
6017      /* A{m,n} must come at the end of the string, there's
6018      * no point in backing off ... */
6019      ST.min = ST.count;
6020      /* ...except that $ and \Z can match before *and* after
6021      newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6022      We may back off by one in this case. */
6023      if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6024       ST.min--;
6025     }
6026     REGCP_SET(ST.cp);
6027     goto curly_try_B_max;
6028    }
6029    assert(0); /* NOTREACHED */
6030
6031
6032   case CURLY_B_min_known_fail:
6033    /* failed to find B in a non-greedy match where c1,c2 valid */
6034
6035    REGCP_UNWIND(ST.cp);
6036    if (ST.paren) {
6037     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6038    }
6039    /* Couldn't or didn't -- move forward. */
6040    ST.oldloc = locinput;
6041    if (utf8_target)
6042     locinput += UTF8SKIP(locinput);
6043    else
6044     locinput++;
6045    ST.count++;
6046   curly_try_B_min_known:
6047    /* find the next place where 'B' could work, then call B */
6048    {
6049     int n;
6050     if (utf8_target) {
6051      n = (ST.oldloc == locinput) ? 0 : 1;
6052      if (ST.c1 == ST.c2) {
6053       /* set n to utf8_distance(oldloc, locinput) */
6054       while (locinput <= ST.maxpos
6055        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6056       {
6057        locinput += UTF8SKIP(locinput);
6058        n++;
6059       }
6060      }
6061      else {
6062       /* set n to utf8_distance(oldloc, locinput) */
6063       while (locinput <= ST.maxpos
6064        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6065        && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6066       {
6067        locinput += UTF8SKIP(locinput);
6068        n++;
6069       }
6070      }
6071     }
6072     else {  /* Not utf8_target */
6073      if (ST.c1 == ST.c2) {
6074       while (locinput <= ST.maxpos &&
6075        UCHARAT(locinput) != ST.c1)
6076        locinput++;
6077      }
6078      else {
6079       while (locinput <= ST.maxpos
6080        && UCHARAT(locinput) != ST.c1
6081        && UCHARAT(locinput) != ST.c2)
6082        locinput++;
6083      }
6084      n = locinput - ST.oldloc;
6085     }
6086     if (locinput > ST.maxpos)
6087      sayNO;
6088     if (n) {
6089      /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6090      * at b; check that everything between oldloc and
6091      * locinput matches */
6092      char *li = ST.oldloc;
6093      ST.count += n;
6094      if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6095       sayNO;
6096      assert(n == REG_INFTY || locinput == li);
6097     }
6098     CURLY_SETPAREN(ST.paren, ST.count);
6099     if (cur_eval && cur_eval->u.eval.close_paren &&
6100      cur_eval->u.eval.close_paren == (U32)ST.paren) {
6101      goto fake_end;
6102     }
6103     PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6104    }
6105    assert(0); /* NOTREACHED */
6106
6107
6108   case CURLY_B_min_fail:
6109    /* failed to find B in a non-greedy match where c1,c2 invalid */
6110
6111    REGCP_UNWIND(ST.cp);
6112    if (ST.paren) {
6113     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6114    }
6115    /* failed -- move forward one */
6116    {
6117     char *li = locinput;
6118     if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6119      sayNO;
6120     }
6121     locinput = li;
6122    }
6123    {
6124     ST.count++;
6125     if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6126       ST.count > 0)) /* count overflow ? */
6127     {
6128     curly_try_B_min:
6129      CURLY_SETPAREN(ST.paren, ST.count);
6130      if (cur_eval && cur_eval->u.eval.close_paren &&
6131       cur_eval->u.eval.close_paren == (U32)ST.paren) {
6132       goto fake_end;
6133      }
6134      PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6135     }
6136    }
6137    sayNO;
6138    assert(0); /* NOTREACHED */
6139
6140
6141   curly_try_B_max:
6142    /* a successful greedy match: now try to match B */
6143    if (cur_eval && cur_eval->u.eval.close_paren &&
6144     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6145     goto fake_end;
6146    }
6147    {
6148     bool could_match = locinput < reginfo->strend;
6149
6150     /* If it could work, try it. */
6151     if (ST.c1 != CHRTEST_VOID && could_match) {
6152      if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6153      {
6154       could_match = memEQ(locinput,
6155            ST.c1_utf8,
6156            UTF8SKIP(locinput))
6157          || memEQ(locinput,
6158            ST.c2_utf8,
6159            UTF8SKIP(locinput));
6160      }
6161      else {
6162       could_match = UCHARAT(locinput) == ST.c1
6163          || UCHARAT(locinput) == ST.c2;
6164      }
6165     }
6166     if (ST.c1 == CHRTEST_VOID || could_match) {
6167      CURLY_SETPAREN(ST.paren, ST.count);
6168      PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6169      assert(0); /* NOTREACHED */
6170     }
6171    }
6172    /* FALL THROUGH */
6173
6174   case CURLY_B_max_fail:
6175    /* failed to find B in a greedy match */
6176
6177    REGCP_UNWIND(ST.cp);
6178    if (ST.paren) {
6179     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6180    }
6181    /*  back up. */
6182    if (--ST.count < ST.min)
6183     sayNO;
6184    locinput = HOPc(locinput, -1);
6185    goto curly_try_B_max;
6186
6187 #undef ST
6188
6189   case END: /*  last op of main pattern  */
6190    fake_end:
6191    if (cur_eval) {
6192     /* we've just finished A in /(??{A})B/; now continue with B */
6193
6194     st->u.eval.prev_rex = rex_sv;  /* inner */
6195
6196     /* Save *all* the positions. */
6197     st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6198     rex_sv = cur_eval->u.eval.prev_rex;
6199     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6200     SET_reg_curpm(rex_sv);
6201     rex = ReANY(rex_sv);
6202     rexi = RXi_GET(rex);
6203     cur_curlyx = cur_eval->u.eval.prev_curlyx;
6204
6205     REGCP_SET(st->u.eval.lastcp);
6206
6207     /* Restore parens of the outer rex without popping the
6208     * savestack */
6209     S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6210           &maxopenparen);
6211
6212     st->u.eval.prev_eval = cur_eval;
6213     cur_eval = cur_eval->u.eval.prev_eval;
6214     DEBUG_EXECUTE_r(
6215      PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6216          REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6217     if ( nochange_depth )
6218      nochange_depth--;
6219
6220     PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6221          locinput); /* match B */
6222    }
6223
6224    if (locinput < reginfo->till) {
6225     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6226          "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6227          PL_colors[4],
6228          (long)(locinput - startpos),
6229          (long)(reginfo->till - startpos),
6230          PL_colors[5]));
6231
6232     sayNO_SILENT;  /* Cannot match: too short. */
6233    }
6234    sayYES;   /* Success! */
6235
6236   case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6237    DEBUG_EXECUTE_r(
6238    PerlIO_printf(Perl_debug_log,
6239     "%*s  %ssubpattern success...%s\n",
6240     REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6241    sayYES;   /* Success! */
6242
6243 #undef  ST
6244 #define ST st->u.ifmatch
6245
6246   {
6247    char *newstart;
6248
6249   case SUSPEND: /* (?>A) */
6250    ST.wanted = 1;
6251    newstart = locinput;
6252    goto do_ifmatch;
6253
6254   case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6255    ST.wanted = 0;
6256    goto ifmatch_trivial_fail_test;
6257
6258   case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6259    ST.wanted = 1;
6260   ifmatch_trivial_fail_test:
6261    if (scan->flags) {
6262     char * const s = HOPBACKc(locinput, scan->flags);
6263     if (!s) {
6264      /* trivial fail */
6265      if (logical) {
6266       logical = 0;
6267       sw = 1 - cBOOL(ST.wanted);
6268      }
6269      else if (ST.wanted)
6270       sayNO;
6271      next = scan + ARG(scan);
6272      if (next == scan)
6273       next = NULL;
6274      break;
6275     }
6276     newstart = s;
6277    }
6278    else
6279     newstart = locinput;
6280
6281   do_ifmatch:
6282    ST.me = scan;
6283    ST.logical = logical;
6284    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6285
6286    /* execute body of (?...A) */
6287    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6288    assert(0); /* NOTREACHED */
6289   }
6290
6291   case IFMATCH_A_fail: /* body of (?...A) failed */
6292    ST.wanted = !ST.wanted;
6293    /* FALL THROUGH */
6294
6295   case IFMATCH_A: /* body of (?...A) succeeded */
6296    if (ST.logical) {
6297     sw = cBOOL(ST.wanted);
6298    }
6299    else if (!ST.wanted)
6300     sayNO;
6301
6302    if (OP(ST.me) != SUSPEND) {
6303     /* restore old position except for (?>...) */
6304     locinput = st->locinput;
6305    }
6306    scan = ST.me + ARG(ST.me);
6307    if (scan == ST.me)
6308     scan = NULL;
6309    continue; /* execute B */
6310
6311 #undef ST
6312
6313   case LONGJMP: /*  alternative with many branches compiles to
6314      * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6315    next = scan + ARG(scan);
6316    if (next == scan)
6317     next = NULL;
6318    break;
6319
6320   case COMMIT:  /*  (*COMMIT)  */
6321    reginfo->cutpoint = reginfo->strend;
6322    /* FALLTHROUGH */
6323
6324   case PRUNE:   /*  (*PRUNE)   */
6325    if (!scan->flags)
6326     sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6327    PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6328    assert(0); /* NOTREACHED */
6329
6330   case COMMIT_next_fail:
6331    no_final = 1;
6332    /* FALLTHROUGH */
6333
6334   case OPFAIL:   /* (*FAIL)  */
6335    sayNO;
6336    assert(0); /* NOTREACHED */
6337
6338 #define ST st->u.mark
6339   case MARKPOINT: /*  (*MARK:foo)  */
6340    ST.prev_mark = mark_state;
6341    ST.mark_name = sv_commit = sv_yes_mark
6342     = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6343    mark_state = st;
6344    ST.mark_loc = locinput;
6345    PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6346    assert(0); /* NOTREACHED */
6347
6348   case MARKPOINT_next:
6349    mark_state = ST.prev_mark;
6350    sayYES;
6351    assert(0); /* NOTREACHED */
6352
6353   case MARKPOINT_next_fail:
6354    if (popmark && sv_eq(ST.mark_name,popmark))
6355    {
6356     if (ST.mark_loc > startpoint)
6357      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6358     popmark = NULL; /* we found our mark */
6359     sv_commit = ST.mark_name;
6360
6361     DEBUG_EXECUTE_r({
6362       PerlIO_printf(Perl_debug_log,
6363        "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6364        REPORT_CODE_OFF+depth*2, "",
6365        PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6366     });
6367    }
6368    mark_state = ST.prev_mark;
6369    sv_yes_mark = mark_state ?
6370     mark_state->u.mark.mark_name : NULL;
6371    sayNO;
6372    assert(0); /* NOTREACHED */
6373
6374   case SKIP:  /*  (*SKIP)  */
6375    if (scan->flags) {
6376     /* (*SKIP) : if we fail we cut here*/
6377     ST.mark_name = NULL;
6378     ST.mark_loc = locinput;
6379     PUSH_STATE_GOTO(SKIP_next,next, locinput);
6380    } else {
6381     /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6382     otherwise do nothing.  Meaning we need to scan
6383     */
6384     regmatch_state *cur = mark_state;
6385     SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6386
6387     while (cur) {
6388      if ( sv_eq( cur->u.mark.mark_name,
6389         find ) )
6390      {
6391       ST.mark_name = find;
6392       PUSH_STATE_GOTO( SKIP_next, next, locinput);
6393      }
6394      cur = cur->u.mark.prev_mark;
6395     }
6396    }
6397    /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6398    break;
6399
6400   case SKIP_next_fail:
6401    if (ST.mark_name) {
6402     /* (*CUT:NAME) - Set up to search for the name as we
6403     collapse the stack*/
6404     popmark = ST.mark_name;
6405    } else {
6406     /* (*CUT) - No name, we cut here.*/
6407     if (ST.mark_loc > startpoint)
6408      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6409     /* but we set sv_commit to latest mark_name if there
6410     is one so they can test to see how things lead to this
6411     cut */
6412     if (mark_state)
6413      sv_commit=mark_state->u.mark.mark_name;
6414    }
6415    no_final = 1;
6416    sayNO;
6417    assert(0); /* NOTREACHED */
6418 #undef ST
6419
6420   case LNBREAK: /* \R */
6421    if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6422     locinput += n;
6423    } else
6424     sayNO;
6425    break;
6426
6427   default:
6428    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6429       PTR2UV(scan), OP(scan));
6430    Perl_croak(aTHX_ "regexp memory corruption");
6431
6432   /* this is a point to jump to in order to increment
6433   * locinput by one character */
6434   increment_locinput:
6435    assert(!NEXTCHR_IS_EOS);
6436    if (utf8_target) {
6437     locinput += PL_utf8skip[nextchr];
6438     /* locinput is allowed to go 1 char off the end, but not 2+ */
6439     if (locinput > reginfo->strend)
6440      sayNO;
6441    }
6442    else
6443     locinput++;
6444    break;
6445
6446   } /* end switch */
6447
6448   /* switch break jumps here */
6449   scan = next; /* prepare to execute the next op and ... */
6450   continue;    /* ... jump back to the top, reusing st */
6451   assert(0); /* NOTREACHED */
6452
6453  push_yes_state:
6454   /* push a state that backtracks on success */
6455   st->u.yes.prev_yes_state = yes_state;
6456   yes_state = st;
6457   /* FALL THROUGH */
6458  push_state:
6459   /* push a new regex state, then continue at scan  */
6460   {
6461    regmatch_state *newst;
6462
6463    DEBUG_STACK_r({
6464     regmatch_state *cur = st;
6465     regmatch_state *curyes = yes_state;
6466     int curd = depth;
6467     regmatch_slab *slab = PL_regmatch_slab;
6468     for (;curd > -1;cur--,curd--) {
6469      if (cur < SLAB_FIRST(slab)) {
6470       slab = slab->prev;
6471       cur = SLAB_LAST(slab);
6472      }
6473      PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6474       REPORT_CODE_OFF + 2 + depth * 2,"",
6475       curd, PL_reg_name[cur->resume_state],
6476       (curyes == cur) ? "yes" : ""
6477      );
6478      if (curyes == cur)
6479       curyes = cur->u.yes.prev_yes_state;
6480     }
6481    } else
6482     DEBUG_STATE_pp("push")
6483    );
6484    depth++;
6485    st->locinput = locinput;
6486    newst = st+1;
6487    if (newst >  SLAB_LAST(PL_regmatch_slab))
6488     newst = S_push_slab(aTHX);
6489    PL_regmatch_state = newst;
6490
6491    locinput = pushinput;
6492    st = newst;
6493    continue;
6494    assert(0); /* NOTREACHED */
6495   }
6496  }
6497
6498  /*
6499  * We get here only if there's trouble -- normally "case END" is
6500  * the terminating point.
6501  */
6502  Perl_croak(aTHX_ "corrupted regexp pointers");
6503  /*NOTREACHED*/
6504  sayNO;
6505
6506 yes:
6507  if (yes_state) {
6508   /* we have successfully completed a subexpression, but we must now
6509   * pop to the state marked by yes_state and continue from there */
6510   assert(st != yes_state);
6511 #ifdef DEBUGGING
6512   while (st != yes_state) {
6513    st--;
6514    if (st < SLAB_FIRST(PL_regmatch_slab)) {
6515     PL_regmatch_slab = PL_regmatch_slab->prev;
6516     st = SLAB_LAST(PL_regmatch_slab);
6517    }
6518    DEBUG_STATE_r({
6519     if (no_final) {
6520      DEBUG_STATE_pp("pop (no final)");
6521     } else {
6522      DEBUG_STATE_pp("pop (yes)");
6523     }
6524    });
6525    depth--;
6526   }
6527 #else
6528   while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6529    || yes_state > SLAB_LAST(PL_regmatch_slab))
6530   {
6531    /* not in this slab, pop slab */
6532    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6533    PL_regmatch_slab = PL_regmatch_slab->prev;
6534    st = SLAB_LAST(PL_regmatch_slab);
6535   }
6536   depth -= (st - yes_state);
6537 #endif
6538   st = yes_state;
6539   yes_state = st->u.yes.prev_yes_state;
6540   PL_regmatch_state = st;
6541
6542   if (no_final)
6543    locinput= st->locinput;
6544   state_num = st->resume_state + no_final;
6545   goto reenter_switch;
6546  }
6547
6548  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6549       PL_colors[4], PL_colors[5]));
6550
6551  if (reginfo->info_aux_eval) {
6552   /* each successfully executed (?{...}) block does the equivalent of
6553   *   local $^R = do {...}
6554   * When popping the save stack, all these locals would be undone;
6555   * bypass this by setting the outermost saved $^R to the latest
6556   * value */
6557   if (oreplsv != GvSV(PL_replgv))
6558    sv_setsv(oreplsv, GvSV(PL_replgv));
6559  }
6560  result = 1;
6561  goto final_exit;
6562
6563 no:
6564  DEBUG_EXECUTE_r(
6565   PerlIO_printf(Perl_debug_log,
6566    "%*s  %sfailed...%s\n",
6567    REPORT_CODE_OFF+depth*2, "",
6568    PL_colors[4], PL_colors[5])
6569   );
6570
6571 no_silent:
6572  if (no_final) {
6573   if (yes_state) {
6574    goto yes;
6575   } else {
6576    goto final_exit;
6577   }
6578  }
6579  if (depth) {
6580   /* there's a previous state to backtrack to */
6581   st--;
6582   if (st < SLAB_FIRST(PL_regmatch_slab)) {
6583    PL_regmatch_slab = PL_regmatch_slab->prev;
6584    st = SLAB_LAST(PL_regmatch_slab);
6585   }
6586   PL_regmatch_state = st;
6587   locinput= st->locinput;
6588
6589   DEBUG_STATE_pp("pop");
6590   depth--;
6591   if (yes_state == st)
6592    yes_state = st->u.yes.prev_yes_state;
6593
6594   state_num = st->resume_state + 1; /* failure = success + 1 */
6595   goto reenter_switch;
6596  }
6597  result = 0;
6598
6599   final_exit:
6600  if (rex->intflags & PREGf_VERBARG_SEEN) {
6601   SV *sv_err = get_sv("REGERROR", 1);
6602   SV *sv_mrk = get_sv("REGMARK", 1);
6603   if (result) {
6604    sv_commit = &PL_sv_no;
6605    if (!sv_yes_mark)
6606     sv_yes_mark = &PL_sv_yes;
6607   } else {
6608    if (!sv_commit)
6609     sv_commit = &PL_sv_yes;
6610    sv_yes_mark = &PL_sv_no;
6611   }
6612   sv_setsv(sv_err, sv_commit);
6613   sv_setsv(sv_mrk, sv_yes_mark);
6614  }
6615
6616
6617  if (last_pushed_cv) {
6618   dSP;
6619   POP_MULTICALL;
6620   PERL_UNUSED_VAR(SP);
6621  }
6622
6623  assert(!result ||  locinput - reginfo->strbeg >= 0);
6624  return result ?  locinput - reginfo->strbeg : -1;
6625 }
6626
6627 /*
6628  - regrepeat - repeatedly match something simple, report how many
6629  *
6630  * What 'simple' means is a node which can be the operand of a quantifier like
6631  * '+', or {1,3}
6632  *
6633  * startposp - pointer a pointer to the start position.  This is updated
6634  *             to point to the byte following the highest successful
6635  *             match.
6636  * p         - the regnode to be repeatedly matched against.
6637  * reginfo   - struct holding match state, such as strend
6638  * max       - maximum number of things to match.
6639  * depth     - (for debugging) backtracking depth.
6640  */
6641 STATIC I32
6642 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
6643    regmatch_info *const reginfo, I32 max, int depth)
6644 {
6645  dVAR;
6646  char *scan;     /* Pointer to current position in target string */
6647  I32 c;
6648  char *loceol = reginfo->strend;   /* local version */
6649  I32 hardcount = 0;  /* How many matches so far */
6650  bool utf8_target = reginfo->is_utf8_target;
6651  int to_complement = 0;  /* Invert the result? */
6652  UV utf8_flags;
6653  _char_class_number classnum;
6654 #ifndef DEBUGGING
6655  PERL_UNUSED_ARG(depth);
6656 #endif
6657
6658  PERL_ARGS_ASSERT_REGREPEAT;
6659
6660  scan = *startposp;
6661  if (max == REG_INFTY)
6662   max = I32_MAX;
6663  else if (! utf8_target && loceol - scan > max)
6664   loceol = scan + max;
6665
6666  /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6667  * to the maximum of how far we should go in it (leaving it set to the real
6668  * end, if the maximum permissible would take us beyond that).  This allows
6669  * us to make the loop exit condition that we haven't gone past <loceol> to
6670  * also mean that we haven't exceeded the max permissible count, saving a
6671  * test each time through the loop.  But it assumes that the OP matches a
6672  * single byte, which is true for most of the OPs below when applied to a
6673  * non-UTF-8 target.  Those relatively few OPs that don't have this
6674  * characteristic will have to compensate.
6675  *
6676  * There is no adjustment for UTF-8 targets, as the number of bytes per
6677  * character varies.  OPs will have to test both that the count is less
6678  * than the max permissible (using <hardcount> to keep track), and that we
6679  * are still within the bounds of the string (using <loceol>.  A few OPs
6680  * match a single byte no matter what the encoding.  They can omit the max
6681  * test if, for the UTF-8 case, they do the adjustment that was skipped
6682  * above.
6683  *
6684  * Thus, the code above sets things up for the common case; and exceptional
6685  * cases need extra work; the common case is to make sure <scan> doesn't
6686  * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6687  * count doesn't exceed the maximum permissible */
6688
6689  switch (OP(p)) {
6690  case REG_ANY:
6691   if (utf8_target) {
6692    while (scan < loceol && hardcount < max && *scan != '\n') {
6693     scan += UTF8SKIP(scan);
6694     hardcount++;
6695    }
6696   } else {
6697    while (scan < loceol && *scan != '\n')
6698     scan++;
6699   }
6700   break;
6701  case SANY:
6702   if (utf8_target) {
6703    while (scan < loceol && hardcount < max) {
6704     scan += UTF8SKIP(scan);
6705     hardcount++;
6706    }
6707   }
6708   else
6709    scan = loceol;
6710   break;
6711  case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
6712   if (utf8_target && loceol - scan > max) {
6713
6714    /* <loceol> hadn't been adjusted in the UTF-8 case */
6715    scan +=  max;
6716   }
6717   else {
6718    scan = loceol;
6719   }
6720   break;
6721  case EXACT:
6722   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6723
6724   c = (U8)*STRING(p);
6725
6726   /* Can use a simple loop if the pattern char to match on is invariant
6727   * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
6728   * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6729   * true iff it doesn't matter if the argument is in UTF-8 or not */
6730   if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
6731    if (utf8_target && loceol - scan > max) {
6732     /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6733     * since here, to match at all, 1 char == 1 byte */
6734     loceol = scan + max;
6735    }
6736    while (scan < loceol && UCHARAT(scan) == c) {
6737     scan++;
6738    }
6739   }
6740   else if (reginfo->is_utf8_pat) {
6741    if (utf8_target) {
6742     STRLEN scan_char_len;
6743
6744     /* When both target and pattern are UTF-8, we have to do
6745     * string EQ */
6746     while (hardcount < max
6747      && scan < loceol
6748      && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6749      && memEQ(scan, STRING(p), scan_char_len))
6750     {
6751      scan += scan_char_len;
6752      hardcount++;
6753     }
6754    }
6755    else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6756
6757     /* Target isn't utf8; convert the character in the UTF-8
6758     * pattern to non-UTF8, and do a simple loop */
6759     c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6760     while (scan < loceol && UCHARAT(scan) == c) {
6761      scan++;
6762     }
6763    } /* else pattern char is above Latin1, can't possibly match the
6764     non-UTF-8 target */
6765   }
6766   else {
6767
6768    /* Here, the string must be utf8; pattern isn't, and <c> is
6769    * different in utf8 than not, so can't compare them directly.
6770    * Outside the loop, find the two utf8 bytes that represent c, and
6771    * then look for those in sequence in the utf8 string */
6772    U8 high = UTF8_TWO_BYTE_HI(c);
6773    U8 low = UTF8_TWO_BYTE_LO(c);
6774
6775    while (hardcount < max
6776      && scan + 1 < loceol
6777      && UCHARAT(scan) == high
6778      && UCHARAT(scan + 1) == low)
6779    {
6780     scan += 2;
6781     hardcount++;
6782    }
6783   }
6784   break;
6785
6786  case EXACTFA:
6787   utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6788   goto do_exactf;
6789
6790  case EXACTFL:
6791   RXp_MATCH_TAINTED_on(prog);
6792   utf8_flags = FOLDEQ_UTF8_LOCALE;
6793   goto do_exactf;
6794
6795  case EXACTF:
6796    utf8_flags = 0;
6797    goto do_exactf;
6798
6799  case EXACTFU_SS:
6800  case EXACTFU_TRICKYFOLD:
6801  case EXACTFU:
6802   utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6803
6804  do_exactf: {
6805   int c1, c2;
6806   U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6807
6808   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
6809
6810   if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
6811           reginfo))
6812   {
6813    if (c1 == CHRTEST_VOID) {
6814     /* Use full Unicode fold matching */
6815     char *tmpeol = reginfo->strend;
6816     STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
6817     while (hardcount < max
6818       && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6819            STRING(p), NULL, pat_len,
6820            reginfo->is_utf8_pat, utf8_flags))
6821     {
6822      scan = tmpeol;
6823      tmpeol = reginfo->strend;
6824      hardcount++;
6825     }
6826    }
6827    else if (utf8_target) {
6828     if (c1 == c2) {
6829      while (scan < loceol
6830       && hardcount < max
6831       && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6832      {
6833       scan += UTF8SKIP(scan);
6834       hardcount++;
6835      }
6836     }
6837     else {
6838      while (scan < loceol
6839       && hardcount < max
6840       && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6841        || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6842      {
6843       scan += UTF8SKIP(scan);
6844       hardcount++;
6845      }
6846     }
6847    }
6848    else if (c1 == c2) {
6849     while (scan < loceol && UCHARAT(scan) == c1) {
6850      scan++;
6851     }
6852    }
6853    else {
6854     while (scan < loceol &&
6855      (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6856     {
6857      scan++;
6858     }
6859    }
6860   }
6861   break;
6862  }
6863  case ANYOF:
6864  case ANYOF_WARN_SUPER:
6865   if (utf8_target) {
6866    while (hardcount < max
6867     && scan < loceol
6868     && reginclass(prog, p, (U8*)scan, utf8_target))
6869    {
6870     scan += UTF8SKIP(scan);
6871     hardcount++;
6872    }
6873   } else {
6874    while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6875     scan++;
6876   }
6877   break;
6878
6879  /* The argument (FLAGS) to all the POSIX node types is the class number */
6880
6881  case NPOSIXL:
6882   to_complement = 1;
6883   /* FALLTHROUGH */
6884
6885  case POSIXL:
6886   RXp_MATCH_TAINTED_on(prog);
6887   if (! utf8_target) {
6888    while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
6889                 *scan)))
6890    {
6891     scan++;
6892    }
6893   } else {
6894    while (hardcount < max && scan < loceol
6895     && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
6896                 (U8 *) scan)))
6897    {
6898     scan += UTF8SKIP(scan);
6899     hardcount++;
6900    }
6901   }
6902   break;
6903
6904  case POSIXD:
6905   if (utf8_target) {
6906    goto utf8_posix;
6907   }
6908   /* FALLTHROUGH */
6909
6910  case POSIXA:
6911   if (utf8_target && loceol - scan > max) {
6912
6913    /* We didn't adjust <loceol> at the beginning of this routine
6914    * because is UTF-8, but it is actually ok to do so, since here, to
6915    * match, 1 char == 1 byte. */
6916    loceol = scan + max;
6917   }
6918   while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6919    scan++;
6920   }
6921   break;
6922
6923  case NPOSIXD:
6924   if (utf8_target) {
6925    to_complement = 1;
6926    goto utf8_posix;
6927   }
6928   /* FALL THROUGH */
6929
6930  case NPOSIXA:
6931   if (! utf8_target) {
6932    while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6933     scan++;
6934    }
6935   }
6936   else {
6937
6938    /* The complement of something that matches only ASCII matches all
6939    * UTF-8 variant code points, plus everything in ASCII that isn't
6940    * in the class. */
6941    while (hardcount < max && scan < loceol
6942     && (! UTF8_IS_INVARIANT(*scan)
6943      || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
6944    {
6945     scan += UTF8SKIP(scan);
6946     hardcount++;
6947    }
6948   }
6949   break;
6950
6951  case NPOSIXU:
6952   to_complement = 1;
6953   /* FALLTHROUGH */
6954
6955  case POSIXU:
6956   if (! utf8_target) {
6957    while (scan < loceol && to_complement
6958         ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
6959    {
6960     scan++;
6961    }
6962   }
6963   else {
6964  utf8_posix:
6965    classnum = (_char_class_number) FLAGS(p);
6966    if (classnum < _FIRST_NON_SWASH_CC) {
6967
6968     /* Here, a swash is needed for above-Latin1 code points.
6969     * Process as many Latin1 code points using the built-in rules.
6970     * Go to another loop to finish processing upon encountering
6971     * the first Latin1 code point.  We could do that in this loop
6972     * as well, but the other way saves having to test if the swash
6973     * has been loaded every time through the loop: extra space to
6974     * save a test. */
6975     while (hardcount < max && scan < loceol) {
6976      if (UTF8_IS_INVARIANT(*scan)) {
6977       if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
6978                 classnum))))
6979       {
6980        break;
6981       }
6982       scan++;
6983      }
6984      else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
6985       if (! (to_complement
6986        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan,
6987                 *(scan + 1)),
6988              classnum))))
6989       {
6990        break;
6991       }
6992       scan += 2;
6993      }
6994      else {
6995       goto found_above_latin1;
6996      }
6997
6998      hardcount++;
6999     }
7000    }
7001    else {
7002     /* For these character classes, the knowledge of how to handle
7003     * every code point is compiled in to Perl via a macro.  This
7004     * code is written for making the loops as tight as possible.
7005     * It could be refactored to save space instead */
7006     switch (classnum) {
7007      case _CC_ENUM_SPACE:    /* XXX would require separate code
7008            if we revert the change of \v
7009            matching this */
7010       /* FALL THROUGH */
7011      case _CC_ENUM_PSXSPC:
7012       while (hardcount < max
7013        && scan < loceol
7014        && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7015       {
7016        scan += UTF8SKIP(scan);
7017        hardcount++;
7018       }
7019       break;
7020      case _CC_ENUM_BLANK:
7021       while (hardcount < max
7022        && scan < loceol
7023        && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7024       {
7025        scan += UTF8SKIP(scan);
7026        hardcount++;
7027       }
7028       break;
7029      case _CC_ENUM_XDIGIT:
7030       while (hardcount < max
7031        && scan < loceol
7032        && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7033       {
7034        scan += UTF8SKIP(scan);
7035        hardcount++;
7036       }
7037       break;
7038      case _CC_ENUM_VERTSPACE:
7039       while (hardcount < max
7040        && scan < loceol
7041        && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7042       {
7043        scan += UTF8SKIP(scan);
7044        hardcount++;
7045       }
7046       break;
7047      case _CC_ENUM_CNTRL:
7048       while (hardcount < max
7049        && scan < loceol
7050        && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7051       {
7052        scan += UTF8SKIP(scan);
7053        hardcount++;
7054       }
7055       break;
7056      default:
7057       Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7058     }
7059    }
7060   }
7061   break;
7062
7063  found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7064
7065   /* Load the swash if not already present */
7066   if (! PL_utf8_swash_ptrs[classnum]) {
7067    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7068    PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7069           "utf8", swash_property_names[classnum],
7070           &PL_sv_undef, 1, 0, NULL, &flags);
7071   }
7072
7073   while (hardcount < max && scan < loceol
7074    && to_complement ^ cBOOL(_generic_utf8(
7075          classnum,
7076          scan,
7077          swash_fetch(PL_utf8_swash_ptrs[classnum],
7078             (U8 *) scan,
7079             TRUE))))
7080   {
7081    scan += UTF8SKIP(scan);
7082    hardcount++;
7083   }
7084   break;
7085
7086  case LNBREAK:
7087   if (utf8_target) {
7088    while (hardcount < max && scan < loceol &&
7089      (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7090     scan += c;
7091     hardcount++;
7092    }
7093   } else {
7094    /* LNBREAK can match one or two latin chars, which is ok, but we
7095    * have to use hardcount in this situation, and throw away the
7096    * adjustment to <loceol> done before the switch statement */
7097    loceol = reginfo->strend;
7098    while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7099     scan+=c;
7100     hardcount++;
7101    }
7102   }
7103   break;
7104
7105  case BOUND:
7106  case BOUNDA:
7107  case BOUNDL:
7108  case BOUNDU:
7109  case EOS:
7110  case GPOS:
7111  case KEEPS:
7112  case NBOUND:
7113  case NBOUNDA:
7114  case NBOUNDL:
7115  case NBOUNDU:
7116  case OPFAIL:
7117  case SBOL:
7118  case SEOL:
7119   /* These are all 0 width, so match right here or not at all. */
7120   break;
7121
7122  default:
7123   Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7124   assert(0); /* NOTREACHED */
7125
7126  }
7127
7128  if (hardcount)
7129   c = hardcount;
7130  else
7131   c = scan - *startposp;
7132  *startposp = scan;
7133
7134  DEBUG_r({
7135   GET_RE_DEBUG_FLAGS_DECL;
7136   DEBUG_EXECUTE_r({
7137    SV * const prop = sv_newmortal();
7138    regprop(prog, prop, p);
7139    PerlIO_printf(Perl_debug_log,
7140       "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7141       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7142   });
7143  });
7144
7145  return(c);
7146 }
7147
7148
7149 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7150 /*
7151 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7152 create a copy so that changes the caller makes won't change the shared one.
7153 If <altsvp> is non-null, will return NULL in it, for back-compat.
7154  */
7155 SV *
7156 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7157 {
7158  PERL_ARGS_ASSERT_REGCLASS_SWASH;
7159
7160  if (altsvp) {
7161   *altsvp = NULL;
7162  }
7163
7164  return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7165 }
7166 #endif
7167
7168 STATIC SV *
7169 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7170 {
7171  /* Returns the swash for the input 'node' in the regex 'prog'.
7172  * If <doinit> is true, will attempt to create the swash if not already
7173  *   done.
7174  * If <listsvp> is non-null, will return the swash initialization string in
7175  *   it.
7176  * Tied intimately to how regcomp.c sets up the data structure */
7177
7178  dVAR;
7179  SV *sw  = NULL;
7180  SV *si  = NULL;
7181  SV*  invlist = NULL;
7182
7183  RXi_GET_DECL(prog,progi);
7184  const struct reg_data * const data = prog ? progi->data : NULL;
7185
7186  PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7187
7188  assert(ANYOF_NONBITMAP(node));
7189
7190  if (data && data->count) {
7191   const U32 n = ARG(node);
7192
7193   if (data->what[n] == 's') {
7194    SV * const rv = MUTABLE_SV(data->data[n]);
7195    AV * const av = MUTABLE_AV(SvRV(rv));
7196    SV **const ary = AvARRAY(av);
7197    U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7198
7199    si = *ary; /* ary[0] = the string to initialize the swash with */
7200
7201    /* Elements 2 and 3 are either both present or both absent. [2] is
7202    * any inversion list generated at compile time; [3] indicates if
7203    * that inversion list has any user-defined properties in it. */
7204    if (av_len(av) >= 2) {
7205     invlist = ary[2];
7206     if (SvUV(ary[3])) {
7207      swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7208     }
7209    }
7210    else {
7211     invlist = NULL;
7212    }
7213
7214    /* Element [1] is reserved for the set-up swash.  If already there,
7215    * return it; if not, create it and store it there */
7216    if (SvROK(ary[1])) {
7217     sw = ary[1];
7218    }
7219    else if (si && doinit) {
7220
7221     sw = _core_swash_init("utf8", /* the utf8 package */
7222          "", /* nameless */
7223          si,
7224          1, /* binary */
7225          0, /* not from tr/// */
7226          invlist,
7227          &swash_init_flags);
7228     (void)av_store(av, 1, sw);
7229    }
7230   }
7231  }
7232
7233  if (listsvp) {
7234   SV* matches_string = newSVpvn("", 0);
7235
7236   /* Use the swash, if any, which has to have incorporated into it all
7237   * possibilities */
7238   if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7239    && (si && si != &PL_sv_undef))
7240   {
7241
7242    /* If no swash, use the input initialization string, if available */
7243    sv_catsv(matches_string, si);
7244   }
7245
7246   /* Add the inversion list to whatever we have.  This may have come from
7247   * the swash, or from an input parameter */
7248   if (invlist) {
7249    sv_catsv(matches_string, _invlist_contents(invlist));
7250   }
7251   *listsvp = matches_string;
7252  }
7253
7254  return sw;
7255 }
7256
7257 /*
7258  - reginclass - determine if a character falls into a character class
7259
7260   n is the ANYOF regnode
7261   p is the target string
7262   utf8_target tells whether p is in UTF-8.
7263
7264   Returns true if matched; false otherwise.
7265
7266   Note that this can be a synthetic start class, a combination of various
7267   nodes, so things you think might be mutually exclusive, such as locale,
7268   aren't.  It can match both locale and non-locale
7269
7270  */
7271
7272 STATIC bool
7273 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7274 {
7275  dVAR;
7276  const char flags = ANYOF_FLAGS(n);
7277  bool match = FALSE;
7278  UV c = *p;
7279
7280  PERL_ARGS_ASSERT_REGINCLASS;
7281
7282  /* If c is not already the code point, get it.  Note that
7283  * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7284  if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7285   STRLEN c_len = 0;
7286   c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7287     (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7288     | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7289     /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7290     * UTF8_ALLOW_FFFF */
7291   if (c_len == (STRLEN)-1)
7292    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7293  }
7294
7295  /* If this character is potentially in the bitmap, check it */
7296  if (c < 256) {
7297   if (ANYOF_BITMAP_TEST(n, c))
7298    match = TRUE;
7299   else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7300     && ! utf8_target
7301     && ! isASCII(c))
7302   {
7303    match = TRUE;
7304   }
7305   else if (flags & ANYOF_LOCALE) {
7306    RXp_MATCH_TAINTED_on(prog);
7307
7308    if ((flags & ANYOF_LOC_FOLD)
7309     && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7310    {
7311     match = TRUE;
7312    }
7313    else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7314
7315     /* The data structure is arranged so bits 0, 2, 4, ... are set
7316     * if the class includes the Posix character class given by
7317     * bit/2; and 1, 3, 5, ... are set if the class includes the
7318     * complemented Posix class given by int(bit/2).  So we loop
7319     * through the bits, each time changing whether we complement
7320     * the result or not.  Suppose for the sake of illustration
7321     * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7322     * is set, it means there is a match for this ANYOF node if the
7323     * character is in the class given by the expression (0 / 2 = 0
7324     * = \w).  If it is in that class, isFOO_lc() will return 1,
7325     * and since 'to_complement' is 0, the result will stay TRUE,
7326     * and we exit the loop.  Suppose instead that bit 0 is 0, but
7327     * bit 1 is 1.  That means there is a match if the character
7328     * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7329     * but will on bit 1.  On the second iteration 'to_complement'
7330     * will be 1, so the exclusive or will reverse things, so we
7331     * are testing for \W.  On the third iteration, 'to_complement'
7332     * will be 0, and we would be testing for \s; the fourth
7333     * iteration would test for \S, etc.
7334     *
7335     * Note that this code assumes that all the classes are closed
7336     * under folding.  For example, if a character matches \w, then
7337     * its fold does too; and vice versa.  This should be true for
7338     * any well-behaved locale for all the currently defined Posix
7339     * classes, except for :lower: and :upper:, which are handled
7340     * by the pseudo-class :cased: which matches if either of the
7341     * other two does.  To get rid of this assumption, an outer
7342     * loop could be used below to iterate over both the source
7343     * character, and its fold (if different) */
7344
7345     int count = 0;
7346     int to_complement = 0;
7347     while (count < ANYOF_MAX) {
7348      if (ANYOF_CLASS_TEST(n, count)
7349       && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7350      {
7351       match = TRUE;
7352       break;
7353      }
7354      count++;
7355      to_complement ^= 1;
7356     }
7357    }
7358   }
7359  }
7360
7361  /* If the bitmap didn't (or couldn't) match, and something outside the
7362  * bitmap could match, try that.  Locale nodes specify completely the
7363  * behavior of code points in the bit map (otherwise, a utf8 target would
7364  * cause them to be treated as Unicode and not locale), except in
7365  * the very unlikely event when this node is a synthetic start class, which
7366  * could be a combination of locale and non-locale nodes.  So allow locale
7367  * to match for the synthetic start class, which will give a false
7368  * positive that will be resolved when the match is done again as not part
7369  * of the synthetic start class */
7370  if (!match) {
7371   if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7372    match = TRUE; /* Everything above 255 matches */
7373   }
7374   else if (ANYOF_NONBITMAP(n)
7375     && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7376      || (utf8_target
7377       && (c >=256
7378        || (! (flags & ANYOF_LOCALE))
7379        || OP(n) == ANYOF_SYNTHETIC))))
7380   {
7381    SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7382    if (sw) {
7383     U8 * utf8_p;
7384     if (utf8_target) {
7385      utf8_p = (U8 *) p;
7386     } else { /* Convert to utf8 */
7387      STRLEN len = 1;
7388      utf8_p = bytes_to_utf8(p, &len);
7389     }
7390
7391     if (swash_fetch(sw, utf8_p, TRUE)) {
7392      match = TRUE;
7393     }
7394
7395     /* If we allocated a string above, free it */
7396     if (! utf8_target) Safefree(utf8_p);
7397    }
7398   }
7399
7400   if (UNICODE_IS_SUPER(c)
7401    && OP(n) == ANYOF_WARN_SUPER
7402    && ckWARN_d(WARN_NON_UNICODE))
7403   {
7404    Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7405     "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7406   }
7407  }
7408
7409  /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7410  return cBOOL(flags & ANYOF_INVERT) ^ match;
7411 }
7412
7413 STATIC U8 *
7414 S_reghop3(U8 *s, I32 off, const U8* lim)
7415 {
7416  /* return the position 'off' UTF-8 characters away from 's', forward if
7417  * 'off' >= 0, backwards if negative.  But don't go outside of position
7418  * 'lim', which better be < s  if off < 0 */
7419
7420  dVAR;
7421
7422  PERL_ARGS_ASSERT_REGHOP3;
7423
7424  if (off >= 0) {
7425   while (off-- && s < lim) {
7426    /* XXX could check well-formedness here */
7427    s += UTF8SKIP(s);
7428   }
7429  }
7430  else {
7431   while (off++ && s > lim) {
7432    s--;
7433    if (UTF8_IS_CONTINUED(*s)) {
7434     while (s > lim && UTF8_IS_CONTINUATION(*s))
7435      s--;
7436    }
7437    /* XXX could check well-formedness here */
7438   }
7439  }
7440  return s;
7441 }
7442
7443 #ifdef XXX_dmq
7444 /* there are a bunch of places where we use two reghop3's that should
7445    be replaced with this routine. but since thats not done yet
7446    we ifdef it out - dmq
7447 */
7448 STATIC U8 *
7449 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7450 {
7451  dVAR;
7452
7453  PERL_ARGS_ASSERT_REGHOP4;
7454
7455  if (off >= 0) {
7456   while (off-- && s < rlim) {
7457    /* XXX could check well-formedness here */
7458    s += UTF8SKIP(s);
7459   }
7460  }
7461  else {
7462   while (off++ && s > llim) {
7463    s--;
7464    if (UTF8_IS_CONTINUED(*s)) {
7465     while (s > llim && UTF8_IS_CONTINUATION(*s))
7466      s--;
7467    }
7468    /* XXX could check well-formedness here */
7469   }
7470  }
7471  return s;
7472 }
7473 #endif
7474
7475 STATIC U8 *
7476 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7477 {
7478  dVAR;
7479
7480  PERL_ARGS_ASSERT_REGHOPMAYBE3;
7481
7482  if (off >= 0) {
7483   while (off-- && s < lim) {
7484    /* XXX could check well-formedness here */
7485    s += UTF8SKIP(s);
7486   }
7487   if (off >= 0)
7488    return NULL;
7489  }
7490  else {
7491   while (off++ && s > lim) {
7492    s--;
7493    if (UTF8_IS_CONTINUED(*s)) {
7494     while (s > lim && UTF8_IS_CONTINUATION(*s))
7495      s--;
7496    }
7497    /* XXX could check well-formedness here */
7498   }
7499   if (off <= 0)
7500    return NULL;
7501  }
7502  return s;
7503 }
7504
7505
7506 /* when executing a regex that may have (?{}), extra stuff needs setting
7507    up that will be visible to the called code, even before the current
7508    match has finished. In particular:
7509
7510    * $_ is localised to the SV currently being matched;
7511    * pos($_) is created if necessary, ready to be updated on each call-out
7512  to code;
7513    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7514  isn't set until the current pattern is successfully finished), so that
7515  $1 etc of the match-so-far can be seen;
7516    * save the old values of subbeg etc of the current regex, and  set then
7517  to the current string (again, this is normally only done at the end
7518  of execution)
7519 */
7520
7521 static void
7522 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7523 {
7524  MAGIC *mg;
7525  regexp *const rex = ReANY(reginfo->prog);
7526  regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7527
7528  eval_state->rex = rex;
7529
7530  if (reginfo->sv) {
7531   /* Make $_ available to executed code. */
7532   if (reginfo->sv != DEFSV) {
7533    SAVE_DEFSV;
7534    DEFSV_set(reginfo->sv);
7535   }
7536
7537   if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
7538    && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
7539    /* prepare for quick setting of pos */
7540 #ifdef PERL_OLD_COPY_ON_WRITE
7541    if (SvIsCOW(reginfo->sv))
7542     sv_force_normal_flags(reginfo->sv, 0);
7543 #endif
7544    mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
7545        &PL_vtbl_mglob, NULL, 0);
7546    mg->mg_len = -1;
7547   }
7548   eval_state->pos_magic = mg;
7549   eval_state->pos       = mg->mg_len;
7550  }
7551  else
7552   eval_state->pos_magic = NULL;
7553
7554  if (!PL_reg_curpm) {
7555   /* PL_reg_curpm is a fake PMOP that we can attach the current
7556   * regex to and point PL_curpm at, so that $1 et al are visible
7557   * within a /(?{})/. It's just allocated once per interpreter the
7558   * first time its needed */
7559   Newxz(PL_reg_curpm, 1, PMOP);
7560 #ifdef USE_ITHREADS
7561   {
7562    SV* const repointer = &PL_sv_undef;
7563    /* this regexp is also owned by the new PL_reg_curpm, which
7564    will try to free it.  */
7565    av_push(PL_regex_padav, repointer);
7566    PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
7567    PL_regex_pad = AvARRAY(PL_regex_padav);
7568   }
7569 #endif
7570  }
7571  SET_reg_curpm(reginfo->prog);
7572  eval_state->curpm = PL_curpm;
7573  PL_curpm = PL_reg_curpm;
7574  if (RXp_MATCH_COPIED(rex)) {
7575   /*  Here is a serious problem: we cannot rewrite subbeg,
7576    since it may be needed if this match fails.  Thus
7577    $` inside (?{}) could fail... */
7578   eval_state->subbeg     = rex->subbeg;
7579   eval_state->sublen     = rex->sublen;
7580   eval_state->suboffset  = rex->suboffset;
7581   eval_state->subcoffset = rex->subcoffset;
7582 #ifdef PERL_ANY_COW
7583   eval_state->saved_copy = rex->saved_copy;
7584 #endif
7585   RXp_MATCH_COPIED_off(rex);
7586  }
7587  else
7588   eval_state->subbeg = NULL;
7589  rex->subbeg = (char *)reginfo->strbeg;
7590  rex->suboffset = 0;
7591  rex->subcoffset = 0;
7592  rex->sublen = reginfo->strend - reginfo->strbeg;
7593 }
7594
7595
7596 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7597
7598 static void
7599 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7600 {
7601  dVAR;
7602  regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7603  regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
7604  regmatch_slab *s;
7605
7606  Safefree(aux->poscache);
7607
7608  if (eval_state) {
7609
7610   /* undo the effects of S_setup_eval_state() */
7611
7612   if (eval_state->subbeg) {
7613    regexp * const rex = eval_state->rex;
7614    rex->subbeg     = eval_state->subbeg;
7615    rex->sublen     = eval_state->sublen;
7616    rex->suboffset  = eval_state->suboffset;
7617    rex->subcoffset = eval_state->subcoffset;
7618 #ifdef PERL_ANY_COW
7619    rex->saved_copy = eval_state->saved_copy;
7620 #endif
7621    RXp_MATCH_COPIED_on(rex);
7622   }
7623   if (eval_state->pos_magic)
7624    eval_state->pos_magic->mg_len = eval_state->pos;
7625
7626   PL_curpm = eval_state->curpm;
7627  }
7628
7629  PL_regmatch_state = aux->old_regmatch_state;
7630  PL_regmatch_slab  = aux->old_regmatch_slab;
7631
7632  /* free all slabs above current one - this must be the last action
7633  * of this function, as aux and eval_state are allocated within
7634  * slabs and may be freed here */
7635
7636  s = PL_regmatch_slab->next;
7637  if (s) {
7638   PL_regmatch_slab->next = NULL;
7639   while (s) {
7640    regmatch_slab * const osl = s;
7641    s = s->next;
7642    Safefree(osl);
7643   }
7644  }
7645 }
7646
7647
7648 STATIC void
7649 S_to_utf8_substr(pTHX_ regexp *prog)
7650 {
7651  /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7652  * on the converted value */
7653
7654  int i = 1;
7655
7656  PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7657
7658  do {
7659   if (prog->substrs->data[i].substr
7660    && !prog->substrs->data[i].utf8_substr) {
7661    SV* const sv = newSVsv(prog->substrs->data[i].substr);
7662    prog->substrs->data[i].utf8_substr = sv;
7663    sv_utf8_upgrade(sv);
7664    if (SvVALID(prog->substrs->data[i].substr)) {
7665     if (SvTAIL(prog->substrs->data[i].substr)) {
7666      /* Trim the trailing \n that fbm_compile added last
7667      time.  */
7668      SvCUR_set(sv, SvCUR(sv) - 1);
7669      /* Whilst this makes the SV technically "invalid" (as its
7670      buffer is no longer followed by "\0") when fbm_compile()
7671      adds the "\n" back, a "\0" is restored.  */
7672      fbm_compile(sv, FBMcf_TAIL);
7673     } else
7674      fbm_compile(sv, 0);
7675    }
7676    if (prog->substrs->data[i].substr == prog->check_substr)
7677     prog->check_utf8 = sv;
7678   }
7679  } while (i--);
7680 }
7681
7682 STATIC bool
7683 S_to_byte_substr(pTHX_ regexp *prog)
7684 {
7685  /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7686  * on the converted value; returns FALSE if can't be converted. */
7687
7688  dVAR;
7689  int i = 1;
7690
7691  PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7692
7693  do {
7694   if (prog->substrs->data[i].utf8_substr
7695    && !prog->substrs->data[i].substr) {
7696    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7697    if (! sv_utf8_downgrade(sv, TRUE)) {
7698     return FALSE;
7699    }
7700    if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7701     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7702      /* Trim the trailing \n that fbm_compile added last
7703       time.  */
7704      SvCUR_set(sv, SvCUR(sv) - 1);
7705      fbm_compile(sv, FBMcf_TAIL);
7706     } else
7707      fbm_compile(sv, 0);
7708    }
7709    prog->substrs->data[i].substr = sv;
7710    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7711     prog->check_substr = sv;
7712   }
7713  } while (i--);
7714
7715  return TRUE;
7716 }
7717
7718 /*
7719  * Local variables:
7720  * c-indentation-style: bsd
7721  * c-basic-offset: 4
7722  * indent-tabs-mode: nil
7723  * End:
7724  *
7725  * ex: set ts=8 sts=4 sw=4 et:
7726  */