src/5021004/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *  One Ring to rule them all, One Ring to find them
   6  *
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * be confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  * Copyright (c) 1986 by University of Toronto.
  44  * Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  * Permission is granted to anyone to use this software for any
  47  * purpose on any computer system, and to redistribute it freely,
  48  * subject to the following restrictions:
  49  *
  50  * 1. The author is not responsible for the consequences of use of
  51  *  this software, no matter how awful, even if they arise
  52  *  from defects in it.
  53  *
  54  * 2. The origin of this software must not be misrepresented, either
  55  *  by explicit claim or by omission.
  56  *
  57  * 3. Altered versions must be plainly marked as such, and must not
  58  *  be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #include "inline_invlist.c"
  85 #include "unicode_constants.h"
  86
  87 #ifdef DEBUGGING
  88 /* At least one required character in the target string is expressible only in
  89  * UTF-8. */
  90 static const char* const non_utf8_target_but_utf8_required
  91     = "Can't match, because target string needs to be in UTF-8\n";
  92 #endif
  93
     /* Print the message above via DEBUG_EXECUTE_r(), then jump to the label
      * named by 'target'.  The message variable only exists under DEBUGGING, so
      * DEBUG_EXECUTE_r() presumably discards its argument in non-debugging
      * builds -- TODO confirm. */
  94 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  95  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  96  goto target; \
  97 } STMT_END
  98
     /* Short local alias for the long-named macro shared with regcomp.c. */
  99 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 100
     /* Fall back to plain 'static' if nothing has defined STATIC yet. */
 101 #ifndef STATIC
 102 #define STATIC static
 103 #endif
 104
 105 /* Valid only for non-utf8 strings: avoids the reginclass
 106  * call if there are no complications: i.e., if everything matchable is
 107  * straight forward in the bitmap */
     /* Any non-zero ANYOF_FLAGS(p) selects the full reginclass() call; otherwise
      * the byte at *c is looked up directly in the node's bitmap.  'p' and 'c'
      * each appear more than once in the expansion, so pass expressions without
      * side effects. */
 108 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0)   \
 109            : ANYOF_BITMAP_TEST(p,*(c)))
 110
 111 /*
 112  * Forwards.
 113  */
 114
     /* NOTE: none of the macros below is self-contained.  CHR_SVLEN expects a
      * variable named 'utf8_target' to be visible where it is used; all the
      * others expect a pointer named 'reginfo'. */

     /* Length of sv: sv_len_utf8(sv) for a UTF-8 target, SvCUR(sv) otherwise. */
 115 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
     /* Distance between a and b: utf8_distance(a,b) for a UTF-8 target, the
      * plain pointer difference otherwise.  NOTE(review): 'a - b' is not
      * parenthesised per argument, so pass simple operands only. */
 116 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 117
     /* Hop 'off' from 'pos', giving a char*.  For a UTF-8 target this goes
      * through reghop3(), limited by reginfo->strend when off >= 0 and by
      * reginfo->strbeg otherwise.  For a non-UTF-8 target it is bare pointer
      * arithmetic with no limit applied. */
 118 #define HOPc(pos,off) \
 119   (char *)(reginfo->is_utf8_target \
 120    ? reghop3((U8*)pos, off, \
 121      (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 122    : (U8*)(pos + off))
 123
     /* Step back 'off' from 'pos'.  In the non-UTF-8 case the result is NULL if
      * it would land before reginfo->strbeg; the UTF-8 case passes the same
      * limit to reghopmaybe3() and returns whatever that returns. */
 124 #define HOPBACKc(pos, off) \
 125   (char*)(reginfo->is_utf8_target \
 126    ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 127    : (pos - off >= reginfo->strbeg) \
 128     ? (U8*)pos - off  \
 129     : NULL)
 130
     /* Like HOPc but with a caller-supplied limit 'lim', which is only consulted
      * in the UTF-8 case; yields U8*.  HOP3c is the same cast to char*. */
 131 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 132 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 133
 134 /* lim must be +ve. Returns NULL on overshoot */
 135 #define HOPMAYBE3(pos,off,lim) \
 136   (reginfo->is_utf8_target                        \
 137    ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 138    : ((U8*)pos + off <= lim)                   \
 139     ? (U8*)pos + off                        \
 140     : NULL)
 141
 142 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 143  * off must be >=0; args should be vars rather than expressions */
 144 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 145  ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 146  : (U8*)((pos + off) > lim ? lim : (pos + off)))
 147
     /* Like HOP3 but hands both a left limit 'llim' and a right limit 'rlim' to
      * reghop4().  As with HOP3, the non-UTF-8 case applies no limit at all.
      * HOP4c is the same cast to char*. */
 148 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 149  ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 150  : (U8*)(pos + off))
 151 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 152
 153 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
     /* Any negative nextchr is treated as end-of-string, not just NEXTCHR_EOS. */
 154 #define NEXTCHR_IS_EOS (nextchr < 0)
 155
     /* Load 'nextchr' with the byte at 'locinput', or with NEXTCHR_EOS when
      * locinput is at or beyond reginfo->strend.  Uses the variables 'nextchr',
      * 'locinput' and 'reginfo' from the scope in which it is expanded. */
 156 #define SET_nextchr \
 157  nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 158
     /* Move 'locinput' to p and refresh 'nextchr' to match.
      * NOTE(review): this expands to two statements with no STMT_START/STMT_END
      * wrapper, so it is not safe as the unbraced body of an if/else/loop. */
 159 #define SET_locinput(p) \
 160  locinput = (p);  \
 161  SET_nextchr
 162
 163
     /* Lazily initialise 'swash_ptr': if it is still NULL, fill it in with the
      * result of _core_swash_init() for 'property_name' / 'invlist', and assert
      * that the result is non-NULL.  Does nothing if it is already set. */
 164 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 165   if (!swash_ptr) {                                                     \
 166    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 167    swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 168           1, 0, invlist, &flags);              \
 169    assert(swash_ptr);                                                \
 170   }                                                                     \
 171  } STMT_END
 172
 173 /* If in debug mode, we test that a known character properly matches */
     /* NOTE(review): the DEBUGGING variant expands to two statements and already
      * ends in ';', whereas the non-DEBUGGING variant is a single statement
      * without one.  Use it only where an extra empty statement is harmless. */
 174 #ifdef DEBUGGING
 175 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 176           property_name,                      \
 177           invlist,                            \
 178           utf8_char_in_property)              \
 179   LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 180   assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 181 #else
 182 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 183           property_name,                      \
 184           invlist,                            \
 185           utf8_char_in_property)              \
 186   LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 187 #endif
 188
     /* Load the swash for _CC_WORDCHAR from PL_XPosix_ptrs[], using
      * LATIN_CAPITAL_LETTER_SHARP_S_UTF8 as the known member for the debug
      * check.  NOTE(review): the definition itself ends in ';'. */
 189 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 190           PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 191           "",                                   \
 192           PL_XPosix_ptrs[_CC_WORDCHAR],         \
 193           LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
 194
     /* Load both swashes named below (PL_utf8_X_regular_begin and
      * PL_utf8_X_extend) by property name, with no inversion list supplied. */
 195 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 196  STMT_START {                                                              \
 197   LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 198          "_X_regular_begin",                    \
 199          NULL,                                  \
 200          LATIN_CAPITAL_LETTER_SHARP_S_UTF8);    \
 201   LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 202          "_X_extend",                           \
 203          NULL,                                  \
 204          COMBINING_GRAVE_ACCENT_UTF8);          \
 205  } STMT_END
 206
 207 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
 208 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 209
 210 /* for use after a quantifier and before an EXACT-like node -- japhy */
 211 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 212  *
 213  * NOTE that *nothing* that affects backtracking should be in here, specifically
 214  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 215  * node that is in between two EXACT like nodes when ascertaining what the required
 216  * "follow" character is. This should probably be moved to regex compile time
 217  * although it may be done at run time because of the REF possibility - more
 218  * investigation required. -- demerphq
      *
      * The expansion reads a variable named 'cur_eval' from the scope in which
      * it is used, and evaluates 'rn' many times, so 'rn' must be free of side
      * effects.  A CLOSE node only counts as jumpable when there is no current
      * eval or its close_paren differs from the node's ARG.
 219 */
 220 #define JUMPABLE(rn) (                                                             \
 221  OP(rn) == OPEN ||                                                              \
 222  (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 223  OP(rn) == EVAL ||                                                              \
 224  OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 225  OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 226  OP(rn) == KEEPS ||                                                             \
 227  (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 228 )
     /* True if the node's kind (per PL_regkind[]) is EXACT. */
 229 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 230
     /* True if the node's kind is EXACT or REF. */
 231 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 232
     /* The '#if 0' branch below is compiled out; only the '#else' definitions
      * are live.  Note that IS_TEXTFU exists only in the live branch. */
 233 #if 0
 234 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 235    we don't need this definition. */
 236 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 237 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 238 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 239
 240 #else
 241 /* ... so we use this as it's faster. */
 242 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 243 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 244 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 245 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 246
 247 #endif
 248
 249 /*
 250   Search for mandatory following text node; for lookahead, the text must
 251   follow but for lookbehind (rn->flags != 0) we skip to the next step.

       'rn' is advanced in place, so it must be a modifiable lvalue.  The loop
       runs while JUMPABLE(rn) holds, stepping as follows:
         SUSPEND or any CURLY-kind node: two NEXTOPER steps;
         PLUS:                           one NEXTOPER step;
         IFMATCH with flags == 0:        two NEXTOPER steps;
         IFMATCH with flags != 0:        forward by ARG(rn);
         anything else:                  forward by NEXT_OFF(rn).
 252 */
 253 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 254  while (JUMPABLE(rn)) { \
 255   const OPCODE type = OP(rn); \
 256   if (type == SUSPEND || PL_regkind[type] == CURLY) \
 257    rn = NEXTOPER(NEXTOPER(rn)); \
 258   else if (type == PLUS) \
 259    rn = NEXTOPER(rn); \
 260   else if (type == IFMATCH) \
 261    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 262   else rn += NEXT_OFF(rn); \
 263  } \
 264 } STMT_END
 265
 266 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 267  * These are for the pre-composed Hangul syllables, which are all in a
 268  * contiguous block and arranged there in such a way so as to facilitate
 269  * algorithmic determination of their characteristics.  As such, they don't need
 270  * a swash, but can be determined by simple arithmetic.  Almost all are
 271  * GCB=LVT, but every 28th one is a GCB=LV */
 272 #define SBASE 0xAC00    /* Start of block */
 273 #define SCount 11172    /* Length of block */
 274 #define TCount 28
 275
     /* Addresses of the first and last regmatch_state slots in slab 's'. */
 276 #define SLAB_FIRST(s) (&(s)->states[0])
 277 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 278
     /* Forward declarations; the definitions are not in this part of the file. */
 279 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 280 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 281 static regmatch_state * S_push_slab(pTHX);
 282
     /* Savestack layout written by S_regcppush() and read back by S_regcppop():
      *   REGCP_PAREN_ELEMS items per saved paren (end, start, start_tmp);
      *   REGCP_OTHER_ELEMS items pushed once (maxopenparen, lastparen,
      *                     lastcloseparen);
      *   REGCP_FRAME_ELEMS item for the trailing SAVEt_REGCONTEXT cookie. */
 283 #define REGCP_PAREN_ELEMS 3
 284 #define REGCP_OTHER_ELEMS 3
 285 #define REGCP_FRAME_ELEMS 1
 286 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 287  * are needed for the regexp context stack bookkeeping. */
 288
     /* S_regcppush(): save the capture-buffer state of 'rex' on the savestack.
      *
      * For each paren in parenfloor+1 .. maxopenparen this pushes offs[].end,
      * offs[].start and offs[].start_tmp (REGCP_PAREN_ELEMS items each).  It
      * then pushes maxopenparen, rex->lastparen and rex->lastcloseparen
      * (REGCP_OTHER_ELEMS items), and finally a SAVEt_REGCONTEXT cookie that
      * also carries the total element count.  S_regcppop() pops these in the
      * reverse order.
      *
      * Returns the value PL_savestack_ix had on entry.  Croaks if the paren
      * element count works out negative, or if the total element count cannot
      * be shifted into the cookie without losing bits. */
 289 STATIC CHECKPOINT
 290 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 291 {
 292  const int retval = PL_savestack_ix;
 293  const int paren_elems_to_push =
 294     (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 295  const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
      /* the count travels in the bits of the cookie above SAVE_TIGHT_SHIFT */
 296  const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 297  I32 p;
 298  GET_RE_DEBUG_FLAGS_DECL;
 299
 300  PERL_ARGS_ASSERT_REGCPPUSH;
 301
 302  if (paren_elems_to_push < 0)
 303   Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 304     (int)paren_elems_to_push, (int)maxopenparen,
 305     (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 306
      /* shifting back must reproduce the count, else bits were lost */
 307  if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 308   Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 309     " out of range (%lu-%ld)",
 310     total_elems,
 311     (unsigned long)maxopenparen,
 312     (long)parenfloor);
 313
      /* presumably reserves savestack room for every item pushed below,
       * cookie included -- TODO confirm SSGROW semantics */
 314  SSGROW(total_elems + REGCP_FRAME_ELEMS);
 315
 316  DEBUG_BUFFERS_r(
 317   if ((int)maxopenparen > (int)parenfloor)
 318    PerlIO_printf(Perl_debug_log,
 319     "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 320     PTR2UV(rex),
 321     PTR2UV(rex->offs)
 322    );
 323  );
 324  for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 325 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 326   SSPUSHIV(rex->offs[p].end);
 327   SSPUSHIV(rex->offs[p].start);
 328   SSPUSHINT(rex->offs[p].start_tmp);
 329   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 330    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 331    (UV)p,
 332    (IV)rex->offs[p].start,
 333    (IV)rex->offs[p].start_tmp,
 334    (IV)rex->offs[p].end
 335   ));
 336  }
 337 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 338  SSPUSHINT(maxopenparen);
 339  SSPUSHINT(rex->lastparen);
 340  SSPUSHINT(rex->lastcloseparen);
 341  SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 342
 343  return retval;
 344 }
 345
 346 /* These are needed since we do not localize EVAL nodes: */
     /* REGCP_SET(cp): emit a DEBUG_STATE_r trace, then store the current
      * PL_savestack_ix in 'cp'. */
 347 #define REGCP_SET(cp)                                           \
 348  DEBUG_STATE_r(                                              \
 349    PerlIO_printf(Perl_debug_log,          \
 350     "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 351     (IV)PL_savestack_ix));                          \
 352  cp = PL_savestack_ix
 353
     /* REGCP_UNWIND(cp): emit a DEBUG_STATE_r trace if 'cp' differs from the
      * current PL_savestack_ix, then call regcpblow(cp). */
 354 #define REGCP_UNWIND(cp)                                        \
 355  DEBUG_STATE_r(                                              \
 356   if (cp != PL_savestack_ix)                   \
 357     PerlIO_printf(Perl_debug_log,          \
 358     "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 359     (IV)(cp), (IV)PL_savestack_ix));                \
 360  regcpblow(cp)
 361
     /* UNWIND_PAREN(lp, lcp): set offs[].end to -1 for every paren above 'lp' up
      * to rex->lastparen, leave rex->lastparen at the loop's final index, and
      * set rex->lastcloseparen to 'lcp'.  Uses the variables 'n' and 'rex' from
      * the scope in which it is expanded.
      * NOTE(review): like REGCP_SET and REGCP_UNWIND above, this expands to
      * several statements with no STMT_START/STMT_END wrapper, so none of them
      * is safe as the unbraced body of an if/else/loop. */
 362 #define UNWIND_PAREN(lp, lcp)               \
 363  for (n = rex->lastparen; n > lp; n--)   \
 364   rex->offs[n].end = -1;              \
 365  rex->lastparen = n;                     \
 366  rex->lastcloseparen = lcp;
 367
 368
     /* S_regcppop(): pop a record written by S_regcppush() off the savestack and
      * restore the capture-buffer state of 'rex' from it.
      *
      * The items come off in the reverse of the push order: first the
      * SAVEt_REGCONTEXT cookie (which yields the element count), then
      * lastcloseparen, lastparen and maxopenparen (the latter is stored through
      * maxopenparen_p), then one (start_tmp, start, end) triple per saved paren,
      * counting down from the restored maxopenparen.
      *
      * A paren's saved 'end' is only written back if the paren is <= the
      * restored lastparen.  Afterwards every paren above lastparen, up to
      * rex->nparens, has its 'end' set to -1, and those above maxopenparen also
      * have their 'start' set to -1. */
 369 STATIC void
 370 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 371 {
 372  UV i;
 373  U32 paren;
 374  GET_RE_DEBUG_FLAGS_DECL;
 375
 376  PERL_ARGS_ASSERT_REGCPPOP;
 377
 378  /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 379  i = SSPOPUV;
 380  assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 381  i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 382  rex->lastcloseparen = SSPOPINT;
 383  rex->lastparen = SSPOPINT;
 384  *maxopenparen_p = SSPOPINT;
 385
      /* what is left in 'i' is the number of per-paren items still to pop */
 386  i -= REGCP_OTHER_ELEMS;
 387  /* Now restore the parentheses context. */
 388  DEBUG_BUFFERS_r(
 389   if (i || rex->lastparen + 1 <= rex->nparens)
 390    PerlIO_printf(Perl_debug_log,
 391     "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 392     PTR2UV(rex),
 393     PTR2UV(rex->offs)
 394    );
 395  );
 396  paren = *maxopenparen_p;
      /* 'i' is unsigned, so this loop relies on it being an exact multiple of
       * REGCP_PAREN_ELEMS, as S_regcppush() arranges */
 397  for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 398   SSize_t tmps;
 399   rex->offs[paren].start_tmp = SSPOPINT;
 400   rex->offs[paren].start = SSPOPIV;
 401   tmps = SSPOPIV;
 402   if (paren <= rex->lastparen)
 403    rex->offs[paren].end = tmps;
 404   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 405    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 406    (UV)paren,
 407    (IV)rex->offs[paren].start,
 408    (IV)rex->offs[paren].start_tmp,
 409    (IV)rex->offs[paren].end,
 410    (paren > rex->lastparen ? "(skipped)" : ""));
 411   );
 412   paren--;
 413  }
 414 #if 1
 415  /* It would seem that the similar code in regtry()
 416  * already takes care of this, and in fact it is in
 417  * a better location too, since this code can be #if 0-ed out
 418  * but the code in regtry() is needed or otherwise tests
 419  * requiring null fields (pat.t#187 and split.t#{13,14}
 420  * (as of patchlevel 7877)) will fail.  Then again,
 421  * this code seems to be necessary or otherwise
 422  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 423  * --jhi updated by dapm */
 424  for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 425   if (i > *maxopenparen_p)
 426    rex->offs[i].start = -1;
 427   rex->offs[i].end = -1;
 428   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 429    "    \\%"UVuf": %s   ..-1 undeffing\n",
 430    (UV)i,
 431    (i > *maxopenparen_p) ? "-1" : "  "
 432   ));
 433  }
 434 #endif
 435 }
 436
 437 /* restore the parens and associated vars at savestack position ix,
 438  * but without popping the stack */
     /* This works by pointing PL_savestack_ix at 'ix' for the duration of the
      * regcppop() call and then putting the original index back.  regcppop is
      * presumably the usual wrapper macro for S_regcppop() -- TODO confirm. */
 439
 440 STATIC void
 441 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 442 {
 443  I32 tmpix = PL_savestack_ix;
 444  PL_savestack_ix = ix;
 445  regcppop(rex, maxopenparen_p);
 446  PL_savestack_ix = tmpix;
 447 }
 448
     /* Discard everything on the savestack above 'cp' via LEAVE_SCOPE(). */
 449 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
 450
     /* S_isFOO_lc(): dispatch on 'classnum' to the matching is*_LC() macro for
      * the single byte 'character'.  Croaks on a class number it has no case
      * for. */
 451 STATIC bool
 452 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 453 {
 454  /* Returns a boolean as to whether or not 'character' is a member of the
 455  * Posix character class given by 'classnum' that should be equivalent to a
 456  * value in the typedef '_char_class_number'.
 457  *
 458  * Ideally this could be replaced by just an array of function pointers
 459  * to the C library functions that implement the macros this calls.
 460  * However, to compile, the precise function signatures are required, and
 461  * these may vary from platform to platform.  To avoid having to figure
 462  * out what those all are on each platform, I (khw) am using this method,
 463  * which adds an extra layer of function call overhead (unless the C
 464  * optimizer strips it away).  But we don't particularly care about
 465  * performance with locales anyway. */
 466
 467  switch ((_char_class_number) classnum) {
 468   case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 469   case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 470   case _CC_ENUM_ASCII:     return isASCII_LC(character);
 471   case _CC_ENUM_BLANK:     return isBLANK_LC(character);
      /* there is no single CASED macro here: cased means lower or upper */
 472   case _CC_ENUM_CASED:     return isLOWER_LC(character)
 473           || isUPPER_LC(character);
 474   case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 475   case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 476   case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 477   case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 478   case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 479   case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 480   case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 481   case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 482   case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 483   case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 484   case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 485   default:    /* VERTSPACE should never occur in locales */
 486    Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 487  }
 488
      /* every case above either returns or croaks */
 489  assert(0); /* NOTREACHED */
 490  return FALSE;
 491 }
 492
 493 STATIC bool
 494 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 495 {
 496  /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 497  * 'character' is a member of the Posix character class given by 'classnum'
 498  * that should be equivalent to a value in the typedef
 499  * '_char_class_number'.
 500  *
 501  * This just calls isFOO_lc on the code point for the character if it is in
 502  * the range 0-255.  Outside that range, all characters use Unicode
 503  * rules, ignoring any locale.  So use the Unicode function if this class
 504  * requires a swash, and use the Unicode macro otherwise. */
 505
 506  PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 507
      /* Code points 0-255: a single invariant byte, or a two-byte sequence
       * that is converted to its native single-byte value first. */
 508  if (UTF8_IS_INVARIANT(*character)) {
 509   return isFOO_lc(classnum, *character);
 510  }
 511  else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 512   return isFOO_lc(classnum,
 513       TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
 514  }
 515
      /* Above 255: classes numbered below _FIRST_NON_SWASH_CC are answered
       * from a swash, which is created on first use and cached in
       * PL_utf8_swash_ptrs[]. */
 516  if (classnum < _FIRST_NON_SWASH_CC) {
 517
 518   /* Initialize the swash unless done already */
 519   if (! PL_utf8_swash_ptrs[classnum]) {
 520    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 521    PL_utf8_swash_ptrs[classnum] =
 522      _core_swash_init("utf8",
 523          "",
 524          &PL_sv_undef, 1, 0,
 525          PL_XPosix_ptrs[classnum], &flags);
 526   }
 527
 528   return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 529         character,
 530         TRUE /* is UTF */ ));
 531  }
 532
      /* The remaining classes are answered by the is_*_high() macros. */
 533  switch ((_char_class_number) classnum) {
 534   case _CC_ENUM_SPACE:
 535   case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 536
 537   case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 538   case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 539   case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 540   default:                 break;
 541  }
 542
 543  return FALSE; /* Things like CNTRL are always below 256 */
 544 }
 545
 546 /*
 547  * pregexec and friends
 548  */
 549
 550 #ifndef PERL_IN_XSUB_RE
 551 /*
 552  - pregexec - match a regexp against a string
      -
      - Thin wrapper: forwards every argument to regexec_flags() with a NULL
      - 'data' argument, and returns its result unchanged.  The only decision
      - made here is the flags value, derived from 'nosave'.
 553  */
 554 I32
 555 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 556   char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 557 /* stringarg: the point in the string at which to begin matching */
 558 /* strend:    pointer to null at end of string */
 559 /* strbeg:    real beginning of string */
 560 /* minend:    end of match must be >= minend bytes after stringarg. */
 561 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 562  *            itself is accessed via the pointers above */
 563 /* nosave:    For optimizations: when non-zero, regexec_flags() is called
      *            with flags 0 instead of REXEC_COPY_STR. */
 564 {
 565  PERL_ARGS_ASSERT_PREGEXEC;
 566
 567  return
 568   regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 569      nosave ? 0 : REXEC_COPY_STR);
 570 }
 571 #endif
 572
 573
 574
 575 /* re_intuit_start():
 576  *
 577  * Based on some optimiser hints, try to find the earliest position in the
 578  * string where the regex could match.
 579  *
 580  *   rx:     the regex to match against
 581  *   sv:     the SV being matched: only used for utf8 flag; the string
 582  *           itself is accessed via the pointers below. Note that on
 583  *           something like an overloaded SV, SvPOK(sv) may be false
 584  *           and the string pointers may point to something unrelated to
 585  *           the SV itself.
 586  *   strbeg: real beginning of string
 587  *   strpos: the point in the string at which to begin matching
 588  *   strend: pointer to the byte following the last char of the string
 589  *   flags   currently unused; set to 0
 590  *   data:   currently unused; set to NULL
 591  *
 592  * The basic idea of re_intuit_start() is to use some known information
 593  * about the pattern, namely:
 594  *
 595  *   a) the longest known anchored substring (i.e. one that's at a
 596  *      constant offset from the beginning of the pattern; but not
 597  *      necessarily at a fixed offset from the beginning of the
 598  *      string);
 599  *   b) the longest floating substring (i.e. one that's not at a constant
 600  *      offset from the beginning of the pattern);
 601  *   c) Whether the pattern is anchored to the string; either
 602  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 603  *      or anchored to pos(): /\G/;
 604  *   d) A start class: a real or synthetic character class which
 605  *      represents which characters are legal at the start of the pattern;
 606  *
 607  * to either quickly reject the match, or to find the earliest position
 608  * within the string at which the pattern might match, thus avoiding
 609  * running the full NFA engine at those earlier locations, only to
 610  * eventually fail and retry further along.
 611  *
 612  * Returns NULL if the pattern can't match, or returns the address within
 613  * the string which is the earliest place the match could occur.
 614  *
 615  * The longest of the anchored and floating substrings is called 'check'
 616  * and is checked first. The other is called 'other' and is checked
 617  * second. The 'other' substring may not be present.  For example,
 618  *
 619  *    /(abc|xyz)ABC\d{0,3}DEFG/
 620  *
 621  * will have
 622  *
 623  *   check substr (float)    = "DEFG", offset 6..9 chars
 624  *   other substr (anchored) = "ABC",  offset 3..3 chars
 625  *   stclass = [ax]
 626  *
 627  * Be aware that during the course of this function, sometimes 'anchored'
 628  * refers to a substring being anchored relative to the start of the
 629  * pattern, and sometimes to the pattern itself being anchored relative to
 630  * the string. For example:
 631  *
 632  *   /\dabc/:   "abc" is anchored to the pattern;
 633  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 634  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 635  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 636  *                    but the pattern is anchored to the string.
 637  */
 638
 639 char *
 640 Perl_re_intuit_start(pTHX_
 641      REGEXP * const rx,
 642      SV *sv,
 643      const char * const strbeg,
 644      char *strpos,
 645      char *strend,
 646      const U32 flags,
 647      re_scream_pos_data *data)
 648 {
 649  struct regexp *const prog = ReANY(rx);
 650  SSize_t start_shift = prog->check_offset_min;
 651  /* Should be nonnegative! */
 652  SSize_t end_shift   = 0;
 653  /* current lowest pos in string where the regex can start matching */
 654  char *rx_origin = strpos;
 655  SV *check;
 656  const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 657  U8   other_ix = 1 - prog->substrs->check_ix;
 658  bool ml_anch = 0;
 659  char *other_last = strpos;/* latest pos 'other' substr already checked to */
 660  char *check_at = NULL;  /* check substr found at this pos */
 661  const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 662  RXi_GET_DECL(prog,progi);
 663  regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 664  regmatch_info *const reginfo = &reginfo_buf;
 665  GET_RE_DEBUG_FLAGS_DECL;
 666
 667  PERL_ARGS_ASSERT_RE_INTUIT_START;
 668  PERL_UNUSED_ARG(flags);
 669  PERL_UNUSED_ARG(data);
 670
 671  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 672     "Intuit: trying to determine minimum start position...\n"));
 673
 674  /* for now, assume that all substr offsets are positive. If at some point
 675  * in the future someone wants to do clever things with look-behind and
 676  * -ve offsets, they'll need to fix up any code in this function
 677  * which uses these offsets. See the thread beginning
 678  * <20140113145929.GF27210@iabyn.com>
 679  */
 680  assert(prog->substrs->data[0].min_offset >= 0);
 681  assert(prog->substrs->data[0].max_offset >= 0);
 682  assert(prog->substrs->data[1].min_offset >= 0);
 683  assert(prog->substrs->data[1].max_offset >= 0);
 684  assert(prog->substrs->data[2].min_offset >= 0);
 685  assert(prog->substrs->data[2].max_offset >= 0);
 686
 687  /* for now, assume that if both present, that the floating substring
 688  * doesn't start before the anchored substring.
 689  * If you break this assumption (e.g. doing better optimisations
 690  * with lookahead/behind), then you'll need to audit the code in this
 691  * function carefully first
 692  */
 693  assert(
 694    ! (  (prog->anchored_utf8 || prog->anchored_substr)
 695    && (prog->float_utf8    || prog->float_substr))
 696   || (prog->float_min_offset >= prog->anchored_offset));
 697
 698  /* byte rather than char calculation for efficiency. It fails
 699  * to quickly reject some cases that can't match, but will reject
 700  * them later after doing full char arithmetic */
 701  if (prog->minlen > strend - strpos) {
 702   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 703        "  String too short...\n"));
 704   goto fail;
 705  }
 706
 707  reginfo->is_utf8_target = cBOOL(utf8_target);
 708  reginfo->info_aux = NULL;
 709  reginfo->strbeg = strbeg;
 710  reginfo->strend = strend;
 711  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 712  reginfo->intuit = 1;
 713  /* not actually used within intuit, but zero for safety anyway */
 714  reginfo->poscache_maxiter = 0;
 715
 716  if (utf8_target) {
 717   if (!prog->check_utf8 && prog->check_substr)
 718    to_utf8_substr(prog);
 719   check = prog->check_utf8;
 720  } else {
 721   if (!prog->check_substr && prog->check_utf8) {
 722    if (! to_byte_substr(prog)) {
 723     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 724    }
 725   }
 726   check = prog->check_substr;
 727  }
 728
 729  /* dump the various substring data */
 730  DEBUG_OPTIMISE_MORE_r({
 731   int i;
 732   for (i=0; i<=2; i++) {
 733    SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 734         : prog->substrs->data[i].substr);
 735    if (!sv)
 736     continue;
 737
 738    PerlIO_printf(Perl_debug_log,
 739     "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 740     " useful=%"IVdf" utf8=%d [%s]\n",
 741     i,
 742     (IV)prog->substrs->data[i].min_offset,
 743     (IV)prog->substrs->data[i].max_offset,
 744     (IV)prog->substrs->data[i].end_shift,
 745     BmUSEFUL(sv),
 746     utf8_target ? 1 : 0,
 747     SvPEEK(sv));
 748   }
 749  });
 750
 751  if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 752
 753   /* ml_anch: check after \n?
 754   *
 755   * A note about IMPLICIT: on an un-anchored pattern beginning
 756   * with /.*.../, these flags will have been added by the
 757   * compiler:
 758   *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 759   *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 760   */
 761   ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 762     && !(prog->intflags & PREGf_IMPLICIT);
 763
 764   if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 765    /* we are only allowed to match at BOS or \G */
 766
 767    /* trivially reject if there's a BOS anchor and we're not at BOS.
 768    *
 769    * Note that we don't try to do a similar quick reject for
 770    * \G, since generally the caller will have calculated strpos
 771    * based on pos() and gofs, so the string is already correctly
 772    * anchored by definition; and handling the exceptions would
 773    * be too fiddly (e.g. REXEC_IGNOREPOS).
 774    */
 775    if (   strpos != strbeg
 776     && (prog->intflags & PREGf_ANCH_SBOL))
 777    {
 778     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 779         "  Not at start...\n"));
 780     goto fail;
 781    }
 782
 783    /* in the presence of an anchor, the anchored (relative to the
 784    * start of the regex) substr must also be anchored relative
 785    * to strpos. So quickly reject if substr isn't found there.
 786    * This works for \G too, because the caller will already have
 787    * subtracted gofs from pos, and gofs is the offset from the
 788    * \G to the start of the regex. For example, in /.abc\Gdef/,
 789    * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 790    * caller will have set strpos=pos()-4; we look for the substr
 791    * at position pos()-4+1, which lines up with the "a" */
 792
 793    if (prog->check_offset_min == prog->check_offset_max
 794     && !(prog->intflags & PREGf_CANY_SEEN))
 795    {
 796     /* Substring at constant offset from beg-of-str... */
 797     SSize_t slen = SvCUR(check);
 798     char *s = HOP3c(strpos, prog->check_offset_min, strend);
 799
 800     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 801      "  Looking for check substr at fixed offset %"IVdf"...\n",
 802      (IV)prog->check_offset_min));
 803
 804     if (SvTAIL(check)) {
 805      /* In this case, the regex is anchored at the end too.
 806      * Unless it's a multiline match, the lengths must match
 807      * exactly, give or take a \n.  NB: slen >= 1 since
 808      * the last char of check is \n */
 809      if (!multiline
 810       && (   strend - s > slen
 811        || strend - s < slen - 1
 812        || (strend - s == slen && strend[-1] != '\n')))
 813      {
 814       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 815            "  String too long...\n"));
 816       goto fail_finish;
 817      }
 818      /* Now should match s[0..slen-2] */
 819      slen--;
 820     }
 821     if (slen && (*SvPVX_const(check) != *s
 822      || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
 823     {
 824      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 825          "  String not equal...\n"));
 826      goto fail_finish;
 827     }
 828
 829     check_at = s;
 830     goto success_at_start;
 831    }
 832   }
 833  }
 834
 835  end_shift = prog->check_end_shift;
 836
 837 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
 838  if (end_shift < 0)
 839   Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 840     (IV)end_shift, RX_PRECOMP(prog));
 841 #endif
 842
 843   restart:
 844
 845  /* This is the (re)entry point of the main loop in this function.
 846  * The goal of this loop is to:
 847  * 1) find the "check" substring in the region rx_origin..strend
 848  *    (adjusted by start_shift / end_shift). If not found, reject
 849  *    immediately.
 850  * 2) If it exists, look for the "other" substr too if defined; for
 851  *    example, if the check substr maps to the anchored substr, then
 852  *    check the floating substr, and vice-versa. If not found, go
 853  *    back to (1) with rx_origin suitably incremented.
 854  * 3) If we find an rx_origin position that doesn't contradict
 855  *    either of the substrings, then check the possible additional
 856  *    constraints on rx_origin of /^.../m or a known start class.
 857  *    If these fail, then depending on which constraints fail, jump
 858  *    back to here, or to various other re-entry points further along
 859  *    that skip some of the first steps.
 860  * 4) If we pass all those tests, update the BmUSEFUL() count on the
 861  *    substring. If the start position was determined to be at the
 862  *    beginning of the string  - so, not rejected, but not optimised,
 863  *    since we have to run regmatch from position 0 - decrement the
 864  *    BmUSEFUL() count. Otherwise increment it.
 865  */
 866
 867
 868  /* first, look for the 'check' substring */
 869
 870  {
 871   U8* start_point;
 872   U8* end_point;
 873
 874   DEBUG_OPTIMISE_MORE_r({
 875    PerlIO_printf(Perl_debug_log,
 876     "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 877     " Start shift: %"IVdf" End shift %"IVdf
 878     " Real end Shift: %"IVdf"\n",
 879     (IV)(rx_origin - strpos),
 880     (IV)prog->check_offset_min,
 881     (IV)start_shift,
 882     (IV)end_shift,
 883     (IV)prog->check_end_shift);
 884   });
 885
 886   if (prog->intflags & PREGf_CANY_SEEN) {
 887    start_point= (U8*)(rx_origin + start_shift);
 888    end_point= (U8*)(strend - end_shift);
 889    if (start_point > end_point)
 890     goto fail_finish;
 891   } else {
 892    end_point = HOP3(strend, -end_shift, strbeg);
 893    start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 894    if (!start_point)
 895     goto fail_finish;
 896   }
 897
 898
 899   /* If the regex is absolutely anchored to either the start of the
 900   * string (SBOL) or to pos() (ANCH_GPOS), then
 901   * check_offset_max represents an upper bound on the string where
 902   * the substr could start. For the ANCH_GPOS case, we assume that
 903   * the caller of intuit will have already set strpos to
 904   * pos()-gofs, so in this case strpos + offset_max will still be
 905   * an upper bound on the substr.
 906   */
 907   if (!ml_anch
 908    && prog->intflags & PREGf_ANCH
 909    && prog->check_offset_max != SSize_t_MAX)
 910   {
 911    SSize_t len = SvCUR(check) - !!SvTAIL(check);
 912    const char * const anchor =
 913       (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 914
 915    /* do a bytes rather than chars comparison. It's conservative;
 916    * so it skips doing the HOP if the result can't possibly end
 917    * up earlier than the old value of end_point.
 918    */
 919    if ((char*)end_point - anchor > prog->check_offset_max) {
 920     end_point = HOP3lim((U8*)anchor,
 921         prog->check_offset_max,
 922         end_point -len)
 923        + len;
 924    }
 925   }
 926
 927   DEBUG_OPTIMISE_MORE_r({
 928    PerlIO_printf(Perl_debug_log, "  fbm_instr len=%d str=<%.*s>\n",
 929     (int)(end_point - start_point),
 930     (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 931     start_point);
 932   });
 933
 934   check_at = fbm_instr( start_point, end_point,
 935      check, multiline ? FBMrf_MULTILINE : 0);
 936
 937   /* Update the count-of-usability, remove useless subpatterns,
 938    unshift s.  */
 939
 940   DEBUG_EXECUTE_r({
 941    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 942     SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 943    PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s%s",
 944        (check_at ? "Found" : "Did not find"),
 945     (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 946      ? "anchored" : "floating"),
 947     quoted,
 948     RE_SV_TAIL(check),
 949     (check_at ? " at offset " : "...\n") );
 950   });
 951
 952   if (!check_at)
 953    goto fail_finish;
 954   /* Finish the diagnostic message */
 955   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
 956
 957   /* set rx_origin to the minimum position where the regex could start
 958   * matching, given the constraint of the just-matched check substring.
 959   * But don't set it lower than previously.
 960   */
 961
 962   if (check_at - rx_origin > prog->check_offset_max)
 963    rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 964  }
 965
 966
 967  /* now look for the 'other' substring if defined */
 968
 969  if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 970      : prog->substrs->data[other_ix].substr)
 971  {
 972   /* Take into account the "other" substring. */
 973   char *last, *last1;
 974   char *s;
 975   SV* must;
 976   struct reg_substr_datum *other;
 977
 978  do_other_substr:
 979   other = &prog->substrs->data[other_ix];
 980
 981   /* if "other" is anchored:
 982   * we've previously found a floating substr starting at check_at.
 983   * This means that the regex origin must lie somewhere
 984   * between min (rx_origin): HOP3(check_at, -check_offset_max)
 985   * and max:                 HOP3(check_at, -check_offset_min)
 986   * (except that min will be >= strpos)
 987   * So the fixed  substr must lie somewhere between
 988   *  HOP3(min, anchored_offset)
 989   *  HOP3(max, anchored_offset) + SvCUR(substr)
 990   */
 991
 992   /* if "other" is floating
 993   * Calculate last1, the absolute latest point where the
 994   * floating substr could start in the string, ignoring any
 995   * constraints from the earlier fixed match. It is calculated
 996   * as follows:
 997   *
 998   * strend - prog->minlen (in chars) is the absolute latest
 999   * position within the string where the origin of the regex
1000   * could appear. The latest start point for the floating
1001   * substr is float_min_offset(*) on from the start of the
1002   * regex.  last1 simply combines these two offsets.
1003   *
1004   * (*) You might think the latest start point should be
1005   * float_max_offset from the regex origin, and technically
1006   * you'd be correct. However, consider
1007   *    /a\d{2,4}bcd\w/
1008   * Here, float min, max are 3,5 and minlen is 7.
1009   * This can match either
1010   *    /a\d\dbcd\w/
1011   *    /a\d\d\dbcd\w/
1012   *    /a\d\d\d\dbcd\w/
1013   * In the first case, the regex matches minlen chars; in the
1014   * second, minlen+1, in the third, minlen+2.
1015   * In the first case, the floating offset is 3 (which equals
1016   * float_min), in the second, 4, and in the third, 5 (which
1017   * equals float_max). In all cases, the floating string bcd
1018   * can never start more than 4 chars from the end of the
1019   * string, which equals minlen - float_min. As the substring
1020   * starts to match more than float_min from the start of the
1021   * regex, it makes the regex match more than minlen chars,
1022   * and the two cancel each other out. So we can always use
1023   * float_min - minlen, rather than float_max - minlen for the
1024   * latest position in the string.
1025   *
1026   * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1027   * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1028   */
1029
1030   assert(prog->minlen >= other->min_offset);
1031   last1 = HOP3c(strend,
1032       other->min_offset - prog->minlen, strbeg);
1033
1034   if (other_ix) {/* i.e. if (other-is-float) */
1035    /* last is the latest point where the floating substr could
1036    * start, *given* any constraints from the earlier fixed
1037    * match. This constraint is that the floating string starts
1038    * <= float_max_offset chars from the regex origin (rx_origin).
1039    * If this value is less than last1, use it instead.
1040    */
1041    assert(rx_origin <= last1);
1042    last =
1043     /* this condition handles the offset==infinity case, and
1044     * is a short-cut otherwise. Although it's comparing a
1045     * byte offset to a char length, it does so in a safe way,
1046     * since 1 char always occupies 1 or more bytes,
1047     * so if a string range is  (last1 - rx_origin) bytes,
1048     * it will be less than or equal to  (last1 - rx_origin)
1049     * chars; meaning it errs towards doing the accurate HOP3
1050     * rather than just using last1 as a short-cut */
1051     (last1 - rx_origin) < other->max_offset
1052      ? last1
1053      : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1054   }
1055   else {
1056    assert(strpos + start_shift <= check_at);
1057    last = HOP4c(check_at, other->min_offset - start_shift,
1058       strbeg, strend);
1059   }
1060
1061   s = HOP3c(rx_origin, other->min_offset, strend);
1062   if (s < other_last) /* These positions already checked */
1063    s = other_last;
1064
1065   must = utf8_target ? other->utf8_substr : other->substr;
1066   assert(SvPOK(must));
1067   s = fbm_instr(
1068    (unsigned char*)s,
1069    (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1070    must,
1071    multiline ? FBMrf_MULTILINE : 0
1072   );
1073   DEBUG_EXECUTE_r({
1074    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1075     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1076    PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s",
1077     s ? "Found" : "Contradicts",
1078     other_ix ? "floating" : "anchored",
1079     quoted, RE_SV_TAIL(must));
1080   });
1081
1082
1083   if (!s) {
1084    /* last1 is latest possible substr location. If we didn't
1085    * find it before there, we never will */
1086    if (last >= last1) {
1087     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1088           ", giving up...\n"));
1089     goto fail_finish;
1090    }
1091
1092    /* try to find the check substr again at a later
1093    * position. Maybe next time we'll find the "other" substr
1094    * in range too */
1095    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1096     ", trying %s at offset %ld...\n",
1097     (other_ix ? "floating" : "anchored"),
1098     (long)(HOP3c(check_at, 1, strend) - strpos)));
1099
1100    other_last = HOP3c(last, 1, strend) /* highest failure */;
1101    rx_origin =
1102     other_ix /* i.e. if other-is-float */
1103      ? HOP3c(rx_origin, 1, strend)
1104      : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1105    goto restart;
1106   }
1107   else {
1108    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
1109     (long)(s - strpos)));
1110
1111    if (other_ix) { /* if (other-is-float) */
1112     /* other_last is set to s, not s+1, since it's possible for
1113     * a floating substr to fail first time, then succeed
1114     * second time at the same floating position; e.g.:
1115     *     "-AB--AABZ" =~ /\wAB\d*Z/
1116     * The first time round, anchored and float match at
1117     * "-(AB)--AAB(Z)" then fail on the initial \w character
1118     * class. Second time round, they match at "-AB--A(AB)(Z)".
1119     */
1120     other_last = s;
1121    }
1122    else {
1123     rx_origin = HOP3c(s, -other->min_offset, strbeg);
1124     other_last = HOP3c(s, 1, strend);
1125    }
1126   }
1127  }
1128  else {
1129   DEBUG_OPTIMISE_MORE_r(
1130    PerlIO_printf(Perl_debug_log,
1131     "  Check-only match: offset min:%"IVdf" max:%"IVdf
1132     " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1133     " strend-strpos:%"IVdf"\n",
1134     (IV)prog->check_offset_min,
1135     (IV)prog->check_offset_max,
1136     (IV)(check_at-strpos),
1137     (IV)(rx_origin-strpos),
1138     (IV)(rx_origin-check_at),
1139     (IV)(strend-strpos)
1140    )
1141   );
1142  }
1143
1144   postprocess_substr_matches:
1145
1146  /* handle the extra constraint of /^.../m if present */
1147
1148  if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1149   char *s;
1150
1151   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1152       "  looking for /^/m anchor"));
1153
1154   /* we have failed the constraint of a \n before rx_origin.
1155   * Find the next \n, if any, even if it's beyond the current
1156   * anchored and/or floating substrings. Whether we should be
1157   * scanning ahead for the next \n or the next substr is debatable.
1158   * On the one hand you'd expect rare substrings to appear less
1159   * often than \n's. On the other hand, searching for \n means
1160   * we're effectively flipping between check_substr and "\n" on each
1161   * iteration as the current "rarest" string candidate, which
1162   * means for example that we'll quickly reject the whole string if
1163   * it hasn't got a \n, rather than trying every substr position
1164   * first
1165   */
1166
1167   s = HOP3c(strend, - prog->minlen, strpos);
1168   if (s <= rx_origin ||
1169    ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1170   {
1171    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1172        "  Did not find /%s^%s/m...\n",
1173        PL_colors[0], PL_colors[1]));
1174    goto fail_finish;
1175   }
1176
1177   /* earliest possible origin is 1 char after the \n.
1178   * (since *rx_origin == '\n', it's safe to ++ here rather than
1179   * HOP(rx_origin, 1)) */
1180   rx_origin++;
1181
1182   if (prog->substrs->check_ix == 0  /* check is anchored */
1183    || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1184   {
1185    /* Position contradicts check-string; either because
1186    * check was anchored (and thus has no wiggle room),
1187    * or check was float and rx_origin is above the float range */
1188    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1189     "  Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1190     PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1191    goto restart;
1192   }
1193
1194   /* if we get here, the check substr must have been float,
1195   * is in range, and we may or may not have had an anchored
1196   * "other" substr which still contradicts */
1197   assert(prog->substrs->check_ix); /* check is float */
1198
1199   if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1200    /* whoops, the anchored "other" substr exists, so we still
1201    * contradict. On the other hand, the float "check" substr
1202    * didn't contradict, so just retry the anchored "other"
1203    * substr */
1204    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1205     "  Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1206     PL_colors[0], PL_colors[1],
1207     (long)(rx_origin - strpos),
1208     (long)(rx_origin - strpos + prog->anchored_offset)));
1209    goto do_other_substr;
1210   }
1211
1212   /* success: we don't contradict the found floating substring
1213   * (and there's no anchored substr). */
1214   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1215    "  Found /%s^%s/m at offset %ld...\n",
1216    PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1217  }
1218  else {
1219   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1220    "  (multiline anchor test skipped)\n"));
1221  }
1222
1223   success_at_start:
1224
1225
1226  /* if we have a starting character class, then test that extra constraint.
1227  * (trie stclasses are too expensive to use here, we are better off to
1228  * leave it to regmatch itself) */
1229
1230  if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1231   const U8* const str = (U8*)STRING(progi->regstclass);
1232
1233   /* XXX this value could be pre-computed */
1234   const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1235      ?  (reginfo->is_utf8_pat
1236       ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1237       : STR_LEN(progi->regstclass))
1238      : 1);
1239   char * endpos;
1240   char *s;
1241   /* latest pos that a matching float substr constrains rx start to */
1242   char *rx_max_float = NULL;
1243
1244   /* if the current rx_origin is anchored, either by satisfying an
1245   * anchored substring constraint, or a /^.../m constraint, then we
1246   * can reject the current origin if the start class isn't found
1247   * at the current position. If we have a float-only match, then
1248   * rx_origin is constrained to a range; so look for the start class
1249   * in that range. if neither, then look for the start class in the
1250   * whole rest of the string */
1251
1252   /* XXX DAPM it's not clear what the minlen test is for, and why
1253   * it's not used in the floating case. Nothing in the test suite
1254   * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1255   * Here are some old comments, which may or may not be correct:
1256   *
1257   *   minlen == 0 is possible if regstclass is \b or \B,
1258   *   and the fixed substr is ''$.
1259   *   Since minlen is already taken into account, rx_origin+1 is
1260   *   before strend; accidentally, minlen >= 1 guarantees no false
1261   *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1262   *   0) below assumes that regstclass does not come from lookahead...
1263   *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1264   *   This leaves EXACTF-ish only, which are dealt with in
1265   *   find_byclass().
1266   */
1267
1268   if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1269    endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1270   else if (prog->float_substr || prog->float_utf8) {
1271    rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1272    endpos= HOP3c(rx_max_float, cl_l, strend);
1273   }
1274   else
1275    endpos= strend;
1276
1277   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1278    "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1279    " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1280    (IV)start_shift, (IV)(check_at - strbeg),
1281    (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1282
1283   s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1284        reginfo);
1285   if (!s) {
1286    if (endpos == strend) {
1287     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1288         "  Could not match STCLASS...\n") );
1289     goto fail;
1290    }
1291    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1292        "  This position contradicts STCLASS...\n") );
1293    if ((prog->intflags & PREGf_ANCH) && !ml_anch
1294       && !(prog->intflags & PREGf_IMPLICIT))
1295     goto fail;
1296
1297    /* Contradict one of substrings */
1298    if (prog->anchored_substr || prog->anchored_utf8) {
1299     if (prog->substrs->check_ix == 1) { /* check is float */
1300      /* Have both, check_string is floating */
1301      assert(rx_origin + start_shift <= check_at);
1302      if (rx_origin + start_shift != check_at) {
1303       /* not at latest position float substr could match:
1304       * Recheck anchored substring, but not floating.
1305       * The condition above is in bytes rather than
1306       * chars for efficiency. It's conservative, in
1307       * that it errs on the side of doing 'goto
1308       * do_other_substr', where a more accurate
1309       * char-based calculation will be done */
1310       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1311         "  Looking for anchored substr starting at offset %ld...\n",
1312         (long)(other_last - strpos)) );
1313       goto do_other_substr;
1314      }
1315     }
1316    }
1317    else {
1318     /* float-only */
1319
1320     if (ml_anch) {
1321      /* In the presence of ml_anch, we might be able to
1322      * find another \n without breaking the current float
1323      * constraint. */
1324
1325      /* strictly speaking this should be HOP3c(..., 1, ...),
1326      * but since we goto a block of code that's going to
1327      * search for the next \n if any, it's safe here */
1328      rx_origin++;
1329      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1330        "  Looking for /%s^%s/m starting at offset %ld...\n",
1331        PL_colors[0], PL_colors[1],
1332        (long)(rx_origin - strpos)) );
1333      goto postprocess_substr_matches;
1334     }
1335
1336     /* strictly speaking this can never be true; but might
1337     * be if we ever allow intuit without substrings */
1338     if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1339      goto fail;
1340
1341     rx_origin = rx_max_float;
1342    }
1343
1344    /* at this point, any matching substrings have been
1345    * contradicted. Start again... */
1346
1347    rx_origin = HOP3c(rx_origin, 1, strend);
1348
1349    /* uses bytes rather than char calculations for efficiency.
1350    * It's conservative: it errs on the side of doing 'goto restart',
1351    * where there is code that does a proper char-based test */
1352    if (rx_origin + start_shift + end_shift > strend) {
1353     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1354          "  Could not match STCLASS...\n") );
1355     goto fail;
1356    }
1357    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1358     "  Looking for %s substr starting at offset %ld...\n",
1359     (prog->substrs->check_ix ? "floating" : "anchored"),
1360     (long)(rx_origin + start_shift - strpos)) );
1361    goto restart;
1362   }
1363
1364   /* Success !!! */
1365
1366   if (rx_origin != s) {
1367    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1368       "  By STCLASS: moving %ld --> %ld\n",
1369         (long)(rx_origin - strpos), (long)(s - strpos))
1370     );
1371   }
1372   else {
1373    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1374         "  Does not contradict STCLASS...\n");
1375     );
1376   }
1377  }
1378
1379  /* Decide whether using the substrings helped */
1380
1381  if (rx_origin != strpos) {
1382   /* Fixed substring is found far enough so that the match
1383   cannot start at strpos. */
1384
1385   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  try at offset...\n"));
1386   ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1387  }
1388  else {
1389   /* The found rx_origin position does not prohibit matching at
1390   * strpos, so calling intuit didn't gain us anything. Decrement
1391   * the BmUSEFUL() count on the check substring, and if we reach
1392   * zero, free it.  */
1393   if (!(prog->intflags & PREGf_NAUGHTY)
1394    && (utf8_target ? (
1395     prog->check_utf8  /* Could be deleted already */
1396     && --BmUSEFUL(prog->check_utf8) < 0
1397     && (prog->check_utf8 == prog->float_utf8)
1398    ) : (
1399     prog->check_substr  /* Could be deleted already */
1400     && --BmUSEFUL(prog->check_substr) < 0
1401     && (prog->check_substr == prog->float_substr)
1402    )))
1403   {
1404    /* If flags & SOMETHING - do not do it many times on the same match */
1405    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  ... Disabling check substring...\n"));
1406    /* XXX Does the destruction order have to change with utf8_target? */
1407    SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1408    SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1409    prog->check_substr = prog->check_utf8 = NULL; /* disable */
1410    prog->float_substr = prog->float_utf8 = NULL; /* clear */
1411    check = NULL;   /* abort */
1412    /* XXXX This is a remnant of the old implementation.  It
1413      looks wasteful, since now INTUIT can use many
1414      other heuristics. */
1415    prog->extflags &= ~RXf_USE_INTUIT;
1416   }
1417  }
1418
1419  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1420    "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1421    PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1422
1423  return rx_origin;
1424
1425   fail_finish:    /* Substring not found */
1426  if (prog->check_substr || prog->check_utf8)  /* could be removed already */
1427   BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1428   fail:
1429  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1430       PL_colors[4], PL_colors[5]));
1431  return NULL;
1432 }
1433
1434
/* Declare a const local named `trie_type` (of an anonymous enum type) that
 * selects how characters are to be read for a trie node:
 *
 *   (scan)->flags == EXACT    -> trie_utf8 / trie_plain
 *   (scan)->flags == EXACTFA  -> trie_utf8_exactfa_fold /
 *                                trie_latin_utf8_exactfa_fold
 *   anything else             -> trie_utf8_fold / trie_latin_utf8_fold
 *
 * In each pair the first value is chosen when `utf8_target` is true, the
 * second when it is false.
 *
 * `scan` may be any expression yielding a pointer to an object with a
 * `flags` member; it is parenthesized in the expansion so that arguments
 * such as `p + 1` parse correctly.  It can be evaluated twice, so it must
 * not have side effects.
 *
 * The expansion is a declaration without a trailing semicolon and relies on
 * `utf8_target`, `EXACT` and `EXACTFA` being visible where it is used.
 * Because it introduces the enum constants, it can be used only once per
 * scope. */
#define DECL_TRIE_TYPE(scan)                                                  \
    const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
                 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold }       \
        trie_type = (((scan)->flags == EXACT)                                 \
                        ? (utf8_target ? trie_utf8 : trie_plain)              \
                        : ((scan)->flags == EXACTFA)                          \
                            ? (utf8_target ? trie_utf8_exactfa_fold           \
                                           : trie_latin_utf8_exactfa_fold)    \
                            : (utf8_target ? trie_utf8_fold                   \
                                           : trie_latin_utf8_fold))
1443
/* Read the next code point to feed to a trie and translate it to the trie's
 * character id.
 *
 * How the code point is obtained depends on trie_type:
 *   trie_plain   - the single byte at uc; len is set to 1.
 *   trie_utf8    - decoded from uc by utf8n_to_uvchr(), which also sets len.
 *   *_fold kinds - if foldlen > 0 there are still folded bytes pending at
 *                  uscan: the next code point is decoded from there, foldlen
 *                  and uscan are advanced past it, and len is set to 0.
 *                  Otherwise the character at uc is folded into foldbuf
 *                  (_to_utf8_fold_flags() for the utf8 kinds,
 *                  _to_fold_latin1() for the latin kinds), len becomes the
 *                  length of the source character (UTF8SKIP(uc), or 1 for
 *                  the latin kinds), and uscan/foldlen are left describing
 *                  what follows the first UNISKIP(uvc) bytes of foldbuf.
 *                  The exactfa kinds additionally pass
 *                  FOLD_FLAGS_NOMIX_ASCII to the fold routine.
 *
 * Afterwards charid is trie->charmap[uvc] when uvc < 256.  For larger values
 * charid is 0 unless widecharmap is non-NULL and holds an entry keyed by the
 * raw bytes of uvc, in which case charid is that entry's IV.
 *
 * uscan, len, uvc, charid and foldlen are assigned to and must be lvalues.
 * Arguments are substituted textually and may be evaluated more than once. */
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
STMT_START {                                                                  \
    U8 fold_opts = FOLD_FLAGS_FULL;                                           \
    STRLEN first_len;                                                         \
    switch (trie_type) {                                                      \
    case trie_plain:                                                          \
        uvc = (UV)*uc;                                                        \
        len = 1;                                                              \
        break;                                                                \
    case trie_utf8:                                                           \
        uvc = utf8n_to_uvchr((const U8*) uc, UTF8_MAXLEN, &len, uniflags);    \
        break;                                                                \
    case trie_utf8_exactfa_fold:                                              \
        fold_opts |= FOLD_FLAGS_NOMIX_ASCII;                                  \
        /* FALLTHROUGH */                                                     \
    case trie_utf8_fold:                                                      \
        if (foldlen > 0) {                                                    \
            /* still consuming the result of an earlier fold */               \
            uvc = utf8n_to_uvchr((const U8*) uscan, UTF8_MAXLEN, &len,        \
                                 uniflags);                                   \
            foldlen -= len;                                                   \
            uscan += len;                                                     \
            len = 0;                                                          \
        }                                                                     \
        else {                                                                \
            /* fold the next target character into foldbuf */                 \
            uvc = _to_utf8_fold_flags((const U8*) uc, foldbuf, &foldlen,      \
                                      fold_opts);                             \
            len = UTF8SKIP(uc);                                               \
            first_len = UNISKIP(uvc);                                         \
            foldlen -= first_len;                                             \
            uscan = foldbuf + first_len;                                      \
        }                                                                     \
        break;                                                                \
    case trie_latin_utf8_exactfa_fold:                                        \
        fold_opts |= FOLD_FLAGS_NOMIX_ASCII;                                  \
        /* FALLTHROUGH */                                                     \
    case trie_latin_utf8_fold:                                                \
        if (foldlen > 0) {                                                    \
            /* still consuming the result of an earlier fold */               \
            uvc = utf8n_to_uvchr((const U8*) uscan, UTF8_MAXLEN, &len,        \
                                 uniflags);                                   \
            foldlen -= len;                                                   \
            uscan += len;                                                     \
            len = 0;                                                          \
        }                                                                     \
        else {                                                                \
            /* fold the next (single-byte) target character into foldbuf */   \
            len = 1;                                                          \
            uvc = _to_fold_latin1((U8) *uc, foldbuf, &foldlen, fold_opts);    \
            first_len = UNISKIP(uvc);                                         \
            foldlen -= first_len;                                             \
            uscan = foldbuf + first_len;                                      \
        }                                                                     \
        break;                                                                \
    }                                                                         \
    if (uvc >= 256) {                                                         \
        /* outside the direct table: consult the wide-char hash, if any */    \
        charid = 0;                                                           \
        if (widecharmap) {                                                    \
            SV** const entry = hv_fetch(widecharmap,                          \
                                        (char*)&uvc, sizeof(UV), 0);          \
            if (entry)                                                        \
                charid = (U16)SvIV(*entry);                                   \
        }                                                                     \
    }                                                                         \
    else {                                                                    \
        charid = trie->charmap[ uvc ];                                        \
    }                                                                         \
} STMT_END
1503
/* DUMP_EXEC_POS(li, s, doutf8): shorthand for calling dump_exec_pos() with
 * the string bounds taken from 'reginfo' (strend first, then strbeg) and with
 * 'startpos' supplied by the expansion site.  Both 'reginfo' and 'startpos'
 * must therefore be in scope wherever this macro is used. */
1504 #define DUMP_EXEC_POS(li,s,doutf8)                          \
1505  dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1506     startpos, doutf8)
1507
/* REXEC_FBC_EXACTISH_SCAN(COND): byte-at-a-time scan of 's' up to and
 * including 'e'.  A position is accepted (goto got_it) when all of these hold,
 * evaluated in this order:
 *   1. COND, the caller's quick test on the byte at s;
 *   2. ln == 1, or folder(s, pat_string, ln) reports a match;
 *   3. reginfo->intuit is set, or regtry(reginfo, &s) succeeds.
 * Otherwise s is advanced by one.  Relies on 's', 'e', 'ln', 'folder',
 * 'pat_string', 'reginfo' and the 'got_it' label at the expansion site. */
1508 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1509 STMT_START {                                              \
1510  while (s <= e) {                                      \
1511   if ( (COND)                                       \
1512    && (ln == 1 || folder(s, pat_string, ln))    \
1513    && (reginfo->intuit || regtry(reginfo, &s)) )\
1514    goto got_it;                                  \
1515   s++;                                              \
1516  }                                                     \
1517 } STMT_END
1518
/* REXEC_FBC_UTF8_SCAN(CODE): loop skeleton for a UTF-8 target.  While
 * s < strend, the statements in CODE are run and then s is advanced by
 * UTF8SKIP(s).  CODE is pasted verbatim into the loop body, so it sees (and
 * may modify) the caller's variables and may jump out of the loop.  Relies on
 * 's' and 'strend' at the expansion site. */
1519 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1520 STMT_START {                                          \
1521  while (s < strend) {                              \
1522   CODE                                          \
1523   s += UTF8SKIP(s);                             \
1524  }                                                 \
1525 } STMT_END
1526
/* REXEC_FBC_SCAN(CODE): byte-oriented counterpart of REXEC_FBC_UTF8_SCAN.
 * While s < strend, the statements in CODE are run and then s is advanced by
 * exactly one byte.  CODE is pasted verbatim into the loop body.  Relies on
 * 's' and 'strend' at the expansion site. */
1527 #define REXEC_FBC_SCAN(CODE)                          \
1528 STMT_START {                                          \
1529  while (s < strend) {                              \
1530   CODE                                          \
1531   s++;                                          \
1532  }                                                 \
1533 } STMT_END
1534
/* REXEC_FBC_UTF8_CLASS_SCAN(COND): walk a UTF-8 target one character at a
 * time looking for a position where COND holds.
 *
 * 'tmp' decides whether such a position is actually tried:
 *   - COND true and tmp non-zero: accept (goto got_it) if reginfo->intuit is
 *     set or regtry() succeeds; if that fails, tmp becomes 'doevery'.
 *   - COND true and tmp zero: no try is made; tmp becomes 'doevery'.
 *   - COND false: tmp is reset to 1.
 * So when 'doevery' is 0, and tmp is non-zero when a run of consecutive
 * positions satisfying COND begins, only the first position of that run is
 * tried.
 *
 * Relies on 's', 'strend', 'tmp', 'doevery', 'reginfo' and the 'got_it' label
 * at the expansion site. */
1535 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1536 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1537  if (COND) {                                                \
1538   if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1539    goto got_it;                                       \
1540   else                                                   \
1541    tmp = doevery;                                     \
1542  }                                                          \
1543  else                                                       \
1544   tmp = 1;                                               \
1545 )
1546
/* REXEC_FBC_CLASS_SCAN(COND): the same loop body as
 * REXEC_FBC_UTF8_CLASS_SCAN, but driven by REXEC_FBC_SCAN so that s advances
 * one byte per iteration.  When COND holds, the position is tried only if
 * 'tmp' is non-zero (goto got_it on success); otherwise, or if the try fails,
 * tmp becomes 'doevery'.  When COND does not hold, tmp is reset to 1.
 *
 * Relies on 's', 'strend', 'tmp', 'doevery', 'reginfo' and the 'got_it' label
 * at the expansion site. */
1547 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1548 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1549  if (COND) {                                                \
1550   if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1551    goto got_it;                                       \
1552   else                                                   \
1553    tmp = doevery;                                     \
1554  }                                                          \
1555  else                                                       \
1556   tmp = 1;                                               \
1557 )
1558
/* REXEC_FBC_CSCAN(CONDUTF8, COND): dispatch on 'utf8_target'.  A UTF-8 target
 * is scanned with REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); any other target is
 * scanned with REXEC_FBC_CLASS_SCAN(COND).  The expansion is a bare if/else,
 * not wrapped in STMT_START/STMT_END. */
1559 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1560  if (utf8_target) {                                         \
1561   REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1562  }                                                          \
1563  else {                                                     \
1564   REXEC_FBC_CLASS_SCAN(COND);                            \
1565  }
1566
1567 /* The three macros below are slightly different versions of the same logic.
1568  *
1569  * The first is for /a and /aa when the target string is UTF-8.  This can only
1570  * match ascii, but it must advance based on UTF-8.   The other two handle the
1571  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1572  * for the boundary (or non-boundary) between a word and non-word character.
1573  * The utf8 and non-utf8 cases have the same logic, but the details must be
1574  * different.  Find the "wordness" of the character just prior to this one, and
1575  * compare it with the wordness of this one.  If they differ, we have a
1576  * boundary.  At the beginning of the string, pretend that the previous
1577  * character was a new-line.
1578  *
1579  * All these macros uncleanly have side effects on each other and on outside
1580  * variables.  So far it's been too much trouble to clean up.
1581  *
1582  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1583  *               a word character or not.
1584  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1585  *               word/non-word
1586  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1587  *
1588  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1589  * are looking for a boundary or for a non-boundary.  If we are looking for a
1590  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1591  * see if this tentative match actually works, and if so, to quit the loop
1592  * here.  And vice-versa if we are looking for a non-boundary.
1593  *
1594  * In the next three macros, 'tmp' is a loop invariant of the REXEC_FBC_SCAN
1595  * and REXEC_FBC_UTF8_SCAN loops: a bool holding the result of the word test
1596  * on the character before s.  To see this, note that that is what it is set
1597  * to on entry to the loop.  To reach the IF_FAIL branch, tmp must already
1598  * equal the test result for the character at s; in the opposite branch,
1599  * IF_SUCCESS, tmp is the complement of that result, but that branch also
1600  * complements tmp.  Either way, at the bottom of the loop tmp equals the
1601  * test result for the character at s, which, once s has advanced, is the
1602  * result for the character before s at the top of the next iteration */
/* FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL): boundary scan over a UTF-8
 * target that uses only a byte-level test.  'tmp' is seeded with
 * TEST_NON_UTF8 of the byte before s ('\n' when s is at reginfo->strbeg);
 * s is then advanced a character at a time, running IF_SUCCESS when the test
 * result for the byte at s differs from tmp (after complementing tmp) and
 * IF_FAIL otherwise.  Relies on 's', 'tmp', 'strend' and 'reginfo' at the
 * expansion site.
 * NOTE: the last line of the definition ends in a continuation backslash, so
 * the line that follows it is still part of the macro. */
1603 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1604  tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1605  tmp = TEST_NON_UTF8(tmp);                                                  \
1606  REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1607   if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1608    tmp = !tmp;                                                        \
1609    IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1610   }                                                                      \
1611   else {                                                                 \
1612    IF_FAIL;                                                           \
1613   }                                                                      \
1614  );                                                                         \
1615
1616 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1617  * TEST_UTF8 is a macro that for the same input code points returns identically
1618  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
/* FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL): seeds 'tmp' with TEST_UV
 * applied either to '\n' (when s is at reginfo->strbeg) or to the code point
 * of the character preceding s, which is located with reghop3() and decoded
 * with utf8n_to_uvchr().  It then invokes LOAD_UTF8_CHARCLASS_ALNUM() and
 * scans forward with REXEC_FBC_UTF8_SCAN: where TEST_UTF8(s) differs from
 * tmp, tmp is complemented and IF_SUCCESS runs; elsewhere IF_FAIL runs.
 * Relies on 's', 'tmp', 'strend' and 'reginfo' at the expansion site. */
1619 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1620  if (s == reginfo->strbeg) {                                                \
1621   tmp = '\n';                                                            \
1622  }                                                                          \
1623  else { /* Back-up to the start of the previous character */                \
1624   U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1625   tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1626              0, UTF8_ALLOW_DEFAULT); \
1627  }                                                                          \
1628  tmp = TEST_UV(tmp);                                                        \
1629  LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1630  REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1631   if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
1632    tmp = !tmp;                                                        \
1633    IF_SUCCESS;                                                        \
1634   }                                                                      \
1635   else {                                                                 \
1636    IF_FAIL;                                                           \
1637   }                                                                      \
1638  );
1639
1640 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1641  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1642  * macros below */
/* FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL):
 *   UTF8_CODE      statements run when 'utf8_target' is true
 *   TEST_NON_UTF8  byte-level word test, used for the non-UTF-8 scan and for
 *                  the final end-of-string check
 *   IF_SUCCESS     code run at a position where a boundary is found
 *   IF_FAIL        code run at a position where no boundary is found
 * After the scan, one more IF_SUCCESS/IF_FAIL decision is made for the end of
 * the string, treating it as if a '\n' followed.
 * In the non-UTF-8 loop IF_SUCCESS runs before 'tmp' is complemented, whereas
 * FBC_UTF8_A and FBC_UTF8 complement 'tmp' first. */
1643 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1644  if (utf8_target) {                                                         \
1645   UTF8_CODE                                                              \
1646  }                                                                          \
1647  else {  /* Not utf8 */                                                     \
1648   tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1649   tmp = TEST_NON_UTF8(tmp);                                              \
1650   REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1651    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1652     IF_SUCCESS;                                                    \
1653     tmp = !tmp;                                                    \
1654    }                                                                  \
1655    else {                                                             \
1656     IF_FAIL;                                                       \
1657    }                                                                  \
1658   );                                                                     \
1659  }                                                                          \
1660  /* Here, things have been set up by the previous code so that tmp is the   \
1661  * return of TEST_NON_UTF8(s-1) or TEST_UTF8(s-1) (depending on the        \
1662  * utf8ness of the target).  We also have to check if this matches against \
1663  * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1664  * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1665  * string) */                                                              \
1666  if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1667   IF_SUCCESS;                                                            \
1668  }                                                                          \
1669  else {                                                                     \
1670   IF_FAIL;                                                               \
1671  }
1672
1673 /* This is the macro to use when we want to see if something that looks like it
1674  * could match, actually does, and if so exits the loop */
/* REXEC_FBC_TRYIT expands to a bare 'if' with no 'else' and no trailing
 * semicolon; the user supplies the ';'.  A candidate position is accepted
 * outright when reginfo->intuit is set; otherwise regtry(reginfo, &s)
 * decides.  Relies on 'reginfo', 's' and the 'got_it' label at the expansion
 * site. */
1675 #define REXEC_FBC_TRYIT                            \
1676  if ((reginfo->intuit || regtry(reginfo, &s)))  \
1677   goto got_it
1678
1679 /* The only difference between the BOUND and NBOUND cases is that
1680  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1681  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1682  * with the other one being empty (PLACEHOLDER is defined as empty).
1683  *
1684  * The TEST_FOO parameters are for operating on different forms of input, but
1685  * all should be ones that return identically for the same underlying code
1686  * points */
/* FBC_BOUND: search for a word boundary.  REXEC_FBC_TRYIT is passed as
 * IF_SUCCESS and PLACEHOLDER as IF_FAIL, so a match is attempted only at
 * positions where a boundary is found.  UTF-8 targets go through FBC_UTF8
 * with TEST_UV and TEST_UTF8; other targets use TEST_NON_UTF8. */
1687 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1688  FBC_BOUND_COMMON(                                                          \
1689   FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1690   TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1691
/* FBC_BOUND_A: like FBC_BOUND, but the UTF-8 branch uses FBC_UTF8_A, which
 * needs only the byte-level TEST_NON_UTF8.  TEST_UV and TEST_UTF8 are
 * accepted so the parameter list parallels FBC_BOUND, but the expansion does
 * not use them. */
1692 #define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                         \
1693  FBC_BOUND_COMMON(                                                          \
1694    FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1695    TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1696
/* FBC_NBOUND: search for a non-boundary.  The roles are swapped relative to
 * FBC_BOUND: PLACEHOLDER is passed as IF_SUCCESS and REXEC_FBC_TRYIT as
 * IF_FAIL, so a match is attempted only at positions where no boundary is
 * found. */
1697 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1698  FBC_BOUND_COMMON(                                                          \
1699   FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1700   TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1701
/* FBC_NBOUND_A: non-boundary search whose UTF-8 branch uses FBC_UTF8_A with
 * the byte-level TEST_NON_UTF8.  REXEC_FBC_TRYIT is passed as IF_FAIL and
 * PLACEHOLDER as IF_SUCCESS.  TEST_UV and TEST_UTF8 are accepted but the
 * expansion does not use them. */
1702 #define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                        \
1703  FBC_BOUND_COMMON(                                                          \
1704    FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1705    TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1706
1707
1708 /* We know what class REx starts with.  Try to find this position... */
1709 /* if reginfo->intuit, it's a dry run */
1710 /* annoyingly all the vars in this routine have different names from their counterparts
1711    in regmatch. /grrr */
1712 STATIC char *
1713 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1714  const char *strend, regmatch_info *reginfo)
1715 {
1716  dVAR;
1717  const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1718  char *pat_string;   /* The pattern's exactish string */
1719  char *pat_end;     /* ptr to end char of pat_string */
1720  re_fold_t folder; /* Function for computing non-utf8 folds */
1721  const U8 *fold_array;   /* array for folding ords < 256 */
1722  STRLEN ln;
1723  STRLEN lnc;
1724  U8 c1;
1725  U8 c2;
1726  char *e;
1727  I32 tmp = 1; /* Scratch variable? */
1728  const bool utf8_target = reginfo->is_utf8_target;
1729  UV utf8_fold_flags = 0;
1730  const bool is_utf8_pat = reginfo->is_utf8_pat;
1731  bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1732         with a result inverts that result, as 0^1 =
1733         1 and 1^1 = 0 */
1734  _char_class_number classnum;
1735
1736  RXi_GET_DECL(prog,progi);
1737
1738  PERL_ARGS_ASSERT_FIND_BYCLASS;
1739
1740  /* We know what class it must start with. */
1741  switch (OP(c)) {
1742  case ANYOF:
1743   if (utf8_target) {
1744    REXEC_FBC_UTF8_CLASS_SCAN(
1745      reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1746   }
1747   else {
1748    REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1749   }
1750   break;
1751  case CANY:
1752   REXEC_FBC_SCAN(
1753    if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1754     goto got_it;
1755    else
1756     tmp = doevery;
1757   );
1758   break;
1759
1760  case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1761   assert(! is_utf8_pat);
1762   /* FALLTHROUGH */
1763  case EXACTFA:
1764   if (is_utf8_pat || utf8_target) {
1765    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1766    goto do_exactf_utf8;
1767   }
1768   fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1769   folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1770   goto do_exactf_non_utf8; /* isn't dealt with by these */
1771
1772  case EXACTF:   /* This node only generated for non-utf8 patterns */
1773   assert(! is_utf8_pat);
1774   if (utf8_target) {
1775    utf8_fold_flags = 0;
1776    goto do_exactf_utf8;
1777   }
1778   fold_array = PL_fold;
1779   folder = foldEQ;
1780   goto do_exactf_non_utf8;
1781
1782  case EXACTFL:
1783   if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1784    utf8_fold_flags = FOLDEQ_LOCALE;
1785    goto do_exactf_utf8;
1786   }
1787   fold_array = PL_fold_locale;
1788   folder = foldEQ_locale;
1789   goto do_exactf_non_utf8;
1790
1791  case EXACTFU_SS:
1792   if (is_utf8_pat) {
1793    utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1794   }
1795   goto do_exactf_utf8;
1796
1797  case EXACTFU:
1798   if (is_utf8_pat || utf8_target) {
1799    utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1800    goto do_exactf_utf8;
1801   }
1802
1803   /* Any 'ss' in the pattern should have been replaced by regcomp,
1804   * so we don't have to worry here about this single special case
1805   * in the Latin1 range */
1806   fold_array = PL_fold_latin1;
1807   folder = foldEQ_latin1;
1808
1809   /* FALLTHROUGH */
1810
1811  do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1812       are no glitches with fold-length differences
1813       between the target string and pattern */
1814
1815   /* The idea in the non-utf8 EXACTF* cases is to first find the
1816   * first character of the EXACTF* node and then, if necessary,
1817   * case-insensitively compare the full text of the node.  c1 is the
1818   * first character.  c2 is its fold.  This logic will not work for
1819   * Unicode semantics and the german sharp ss, which hence should
1820   * not be compiled into a node that gets here. */
1821   pat_string = STRING(c);
1822   ln  = STR_LEN(c); /* length to match in octets/bytes */
1823
1824   /* We know that we have to match at least 'ln' bytes (which is the
1825   * same as characters, since not utf8).  If we have to match 3
1826   * characters, and there are only 2 available, we know without
1827   * trying that it will fail; so don't start a match past the
1828   * required minimum number from the far end */
1829   e = HOP3c(strend, -((SSize_t)ln), s);
1830
1831   if (reginfo->intuit && e < s) {
1832    e = s;   /* Due to minlen logic of intuit() */
1833   }
1834
1835   c1 = *pat_string;
1836   c2 = fold_array[c1];
1837   if (c1 == c2) { /* If char and fold are the same */
1838    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1839   }
1840   else {
1841    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1842   }
1843   break;
1844
1845  do_exactf_utf8:
1846  {
1847   unsigned expansion;
1848
1849   /* If one of the operands is in utf8, we can't use the simpler folding
1850   * above, due to the fact that many different characters can have the
1851   * same fold, or portion of a fold, or different-length fold */
1852   pat_string = STRING(c);
1853   ln  = STR_LEN(c); /* length to match in octets/bytes */
1854   pat_end = pat_string + ln;
1855   lnc = is_utf8_pat       /* length to match in characters */
1856     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1857     : ln;
1858
1859   /* We have 'lnc' characters to match in the pattern, but because of
1860   * multi-character folding, each character in the target can match
1861   * up to 3 characters (Unicode guarantees it will never exceed
1862   * this) if it is utf8-encoded; and up to 2 if not (based on the
1863   * fact that the Latin 1 folds are already determined, and the
1864   * only multi-char fold in that range is the sharp-s folding to
1865   * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1866   * string character.  Adjust lnc accordingly, rounding up, so that
1867   * if we need to match at least 4+1/3 chars, that really is 5. */
1868   expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1869   lnc = (lnc + expansion - 1) / expansion;
1870
1871   /* As in the non-UTF8 case, if we have to match 3 characters, and
1872   * only 2 are left, it's guaranteed to fail, so don't start a
1873   * match that would require us to go beyond the end of the string
1874   */
1875   e = HOP3c(strend, -((SSize_t)lnc), s);
1876
1877   if (reginfo->intuit && e < s) {
1878    e = s;   /* Due to minlen logic of intuit() */
1879   }
1880
1881   /* XXX Note that we could recalculate e to stop the loop earlier,
1882   * as the worst case expansion above will rarely be met, and as we
1883   * go along we would usually find that e moves further to the left.
1884   * This would happen only after we reached the point in the loop
1885   * where if there were no expansion we should fail.  Unclear if
1886   * worth the expense */
1887
1888   while (s <= e) {
1889    char *my_strend= (char *)strend;
1890    if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1891     pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1892     && (reginfo->intuit || regtry(reginfo, &s)) )
1893    {
1894     goto got_it;
1895    }
1896    s += (utf8_target) ? UTF8SKIP(s) : 1;
1897   }
1898   break;
1899  }
1900
1901  case BOUNDL:
1902   FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1903   break;
1904  case NBOUNDL:
1905   FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1906   break;
1907  case BOUND:
1908   FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
1909   break;
1910  case BOUNDA:
1911   FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
1912   break;
1913  case NBOUND:
1914   FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
1915   break;
1916  case NBOUNDA:
1917   FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
1918   break;
1919  case BOUNDU:
1920   FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
1921   break;
1922  case NBOUNDU:
1923   FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
1924   break;
1925  case LNBREAK:
1926   REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1927       is_LNBREAK_latin1_safe(s, strend)
1928   );
1929   break;
1930
1931  /* The argument to all the POSIX node types is the class number to pass to
1932  * _generic_isCC() to build a mask for searching in PL_charclass[] */
1933
1934  case NPOSIXL:
1935   to_complement = 1;
1936   /* FALLTHROUGH */
1937
1938  case POSIXL:
1939   REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1940       to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1941   break;
1942
1943  case NPOSIXD:
1944   to_complement = 1;
1945   /* FALLTHROUGH */
1946
1947  case POSIXD:
1948   if (utf8_target) {
1949    goto posix_utf8;
1950   }
1951   goto posixa;
1952
1953  case NPOSIXA:
1954   if (utf8_target) {
1955    /* The complement of something that matches only ASCII matches all
1956    * non-ASCII, plus everything in ASCII that isn't in the class. */
1957    REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
1958          || ! _generic_isCC_A(*s, FLAGS(c)));
1959    break;
1960   }
1961
1962   to_complement = 1;
1963   /* FALLTHROUGH */
1964
1965  case POSIXA:
1966  posixa:
1967   /* Don't need to worry about utf8, as it can match only a single
1968   * byte invariant character. */
1969   REXEC_FBC_CLASS_SCAN(
1970       to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1971   break;
1972
1973  case NPOSIXU:
1974   to_complement = 1;
1975   /* FALLTHROUGH */
1976
1977  case POSIXU:
1978   if (! utf8_target) {
1979    REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1980                  FLAGS(c))));
1981   }
1982   else {
1983
1984  posix_utf8:
1985    classnum = (_char_class_number) FLAGS(c);
1986    if (classnum < _FIRST_NON_SWASH_CC) {
1987     while (s < strend) {
1988
1989      /* We avoid loading in the swash as long as possible, but
1990      * should we have to, we jump to a separate loop.  This
1991      * extra 'if' statement is what keeps this code from being
1992      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1993      if (UTF8_IS_ABOVE_LATIN1(*s)) {
1994       goto found_above_latin1;
1995      }
1996      if ((UTF8_IS_INVARIANT(*s)
1997       && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1998                 classnum)))
1999       || (UTF8_IS_DOWNGRADEABLE_START(*s)
2000        && to_complement ^ cBOOL(
2001         _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
2002                  *(s + 1)),
2003            classnum))))
2004      {
2005       if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2006        goto got_it;
2007       else {
2008        tmp = doevery;
2009       }
2010      }
2011      else {
2012       tmp = 1;
2013      }
2014      s += UTF8SKIP(s);
2015     }
2016    }
2017    else switch (classnum) {    /* These classes are implemented as
2018           macros */
2019     case _CC_ENUM_SPACE: /* XXX would require separate code if we
2020           revert the change of \v matching this */
2021      /* FALLTHROUGH */
2022
2023     case _CC_ENUM_PSXSPC:
2024      REXEC_FBC_UTF8_CLASS_SCAN(
2025           to_complement ^ cBOOL(isSPACE_utf8(s)));
2026      break;
2027
2028     case _CC_ENUM_BLANK:
2029      REXEC_FBC_UTF8_CLASS_SCAN(
2030           to_complement ^ cBOOL(isBLANK_utf8(s)));
2031      break;
2032
2033     case _CC_ENUM_XDIGIT:
2034      REXEC_FBC_UTF8_CLASS_SCAN(
2035          to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2036      break;
2037
2038     case _CC_ENUM_VERTSPACE:
2039      REXEC_FBC_UTF8_CLASS_SCAN(
2040          to_complement ^ cBOOL(isVERTWS_utf8(s)));
2041      break;
2042
2043     case _CC_ENUM_CNTRL:
2044      REXEC_FBC_UTF8_CLASS_SCAN(
2045           to_complement ^ cBOOL(isCNTRL_utf8(s)));
2046      break;
2047
2048     default:
2049      Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2050      assert(0); /* NOTREACHED */
2051    }
2052   }
2053   break;
2054
2055  found_above_latin1:   /* Here we have to load a swash to get the result
2056        for the current code point */
2057   if (! PL_utf8_swash_ptrs[classnum]) {
2058    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2059    PL_utf8_swash_ptrs[classnum] =
2060      _core_swash_init("utf8",
2061          "",
2062          &PL_sv_undef, 1, 0,
2063          PL_XPosix_ptrs[classnum], &flags);
2064   }
2065
2066   /* This is a copy of the loop above for swash classes, though using the
2067   * FBC macro instead of being expanded out.  Since we've loaded the
2068   * swash, we don't have to check for that each time through the loop */
2069   REXEC_FBC_UTF8_CLASS_SCAN(
2070     to_complement ^ cBOOL(_generic_utf8(
2071          classnum,
2072          s,
2073          swash_fetch(PL_utf8_swash_ptrs[classnum],
2074             (U8 *) s, TRUE))));
2075   break;
2076
2077  case AHOCORASICKC:
2078  case AHOCORASICK:
2079   {
2080    DECL_TRIE_TYPE(c);
2081    /* what trie are we using right now */
2082    reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2083    reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2084    HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2085
2086    const char *last_start = strend - trie->minlen;
2087 #ifdef DEBUGGING
2088    const char *real_start = s;
2089 #endif
2090    STRLEN maxlen = trie->maxlen;
2091    SV *sv_points;
2092    U8 **points; /* map of where we were in the input string
2093        when reading a given char. For ASCII this
2094        is unnecessary overhead as the relationship
2095        is always 1:1, but for Unicode, especially
2096        case folded Unicode this is not true. */
2097    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2098    U8 *bitmap=NULL;
2099
2100
2101    GET_RE_DEBUG_FLAGS_DECL;
2102
2103    /* We can't just allocate points here. We need to wrap it in
2104    * an SV so it gets freed properly if there is a croak while
2105    * running the match */
2106    ENTER;
2107    SAVETMPS;
2108    sv_points=newSV(maxlen * sizeof(U8 *));
2109    SvCUR_set(sv_points,
2110     maxlen * sizeof(U8 *));
2111    SvPOK_on(sv_points);
2112    sv_2mortal(sv_points);
2113    points=(U8**)SvPV_nolen(sv_points );
2114    if ( trie_type != trie_utf8_fold
2115     && (trie->bitmap || OP(c)==AHOCORASICKC) )
2116    {
2117     if (trie->bitmap)
2118      bitmap=(U8*)trie->bitmap;
2119     else
2120      bitmap=(U8*)ANYOF_BITMAP(c);
2121    }
2122    /* this is the Aho-Corasick algorithm modified a touch
2123    to include special handling for long "unknown char" sequences.
2124    The basic idea being that we use AC as long as we are dealing
2125    with a possible matching char, when we encounter an unknown char
2126    (and we have not encountered an accepting state) we scan forward
2127    until we find a legal starting char.
2128    AC matching is basically that of trie matching, except that when
2129    we encounter a failing transition, we fall back to the current
2130    states "fail state", and try the current char again, a process
2131    we repeat until we reach the root state, state 1, or a legal
2132    transition. If we fail on the root state then we can either
2133    terminate if we have reached an accepting state previously, or
2134    restart the entire process from the beginning if we have not.
2135
2136    */
2137    while (s <= last_start) {
2138     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2139     U8 *uc = (U8*)s;
2140     U16 charid = 0;
2141     U32 base = 1;
2142     U32 state = 1;
2143     UV uvc = 0;
2144     STRLEN len = 0;
2145     STRLEN foldlen = 0;
2146     U8 *uscan = (U8*)NULL;
2147     U8 *leftmost = NULL;
2148 #ifdef DEBUGGING
2149     U32 accepted_word= 0;
2150 #endif
2151     U32 pointpos = 0;
2152
2153     while ( state && uc <= (U8*)strend ) {
2154      int failed=0;
2155      U32 word = aho->states[ state ].wordnum;
2156
2157      if( state==1 ) {
2158       if ( bitmap ) {
2159        DEBUG_TRIE_EXECUTE_r(
2160         if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2161          dump_exec_pos( (char *)uc, c, strend, real_start,
2162           (char *)uc, utf8_target );
2163          PerlIO_printf( Perl_debug_log,
2164           " Scanning for legal start char...\n");
2165         }
2166        );
2167        if (utf8_target) {
2168         while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2169          uc += UTF8SKIP(uc);
2170         }
2171        } else {
2172         while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2173          uc++;
2174         }
2175        }
2176        s= (char *)uc;
2177       }
2178       if (uc >(U8*)last_start) break;
2179      }
2180
2181      if ( word ) {
2182       U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2183       if (!leftmost || lpos < leftmost) {
2184        DEBUG_r(accepted_word=word);
2185        leftmost= lpos;
2186       }
2187       if (base==0) break;
2188
2189      }
2190      points[pointpos++ % maxlen]= uc;
2191      if (foldlen || uc < (U8*)strend) {
2192       REXEC_TRIE_READ_CHAR(trie_type, trie,
2193           widecharmap, uc,
2194           uscan, len, uvc, charid, foldlen,
2195           foldbuf, uniflags);
2196       DEBUG_TRIE_EXECUTE_r({
2197        dump_exec_pos( (char *)uc, c, strend,
2198           real_start, s, utf8_target);
2199        PerlIO_printf(Perl_debug_log,
2200         " Charid:%3u CP:%4"UVxf" ",
2201         charid, uvc);
2202       });
2203      }
2204      else {
2205       len = 0;
2206       charid = 0;
2207      }
2208
2209
2210      do {
2211 #ifdef DEBUGGING
2212       word = aho->states[ state ].wordnum;
2213 #endif
2214       base = aho->states[ state ].trans.base;
2215
2216       DEBUG_TRIE_EXECUTE_r({
2217        if (failed)
2218         dump_exec_pos( (char *)uc, c, strend, real_start,
2219          s,   utf8_target );
2220        PerlIO_printf( Perl_debug_log,
2221         "%sState: %4"UVxf", word=%"UVxf,
2222         failed ? " Fail transition to " : "",
2223         (UV)state, (UV)word);
2224       });
2225       if ( base ) {
2226        U32 tmp;
2227        I32 offset;
2228        if (charid &&
2229         ( ((offset = base + charid
2230          - 1 - trie->uniquecharcount)) >= 0)
2231         && ((U32)offset < trie->lasttrans)
2232         && trie->trans[offset].check == state
2233         && (tmp=trie->trans[offset].next))
2234        {
2235         DEBUG_TRIE_EXECUTE_r(
2236          PerlIO_printf( Perl_debug_log," - legal\n"));
2237         state = tmp;
2238         break;
2239        }
2240        else {
2241         DEBUG_TRIE_EXECUTE_r(
2242          PerlIO_printf( Perl_debug_log," - fail\n"));
2243         failed = 1;
2244         state = aho->fail[state];
2245        }
2246       }
2247       else {
2248        /* we must be accepting here */
2249        DEBUG_TRIE_EXECUTE_r(
2250          PerlIO_printf( Perl_debug_log," - accepting\n"));
2251        failed = 1;
2252        break;
2253       }
2254      } while(state);
2255      uc += len;
2256      if (failed) {
2257       if (leftmost)
2258        break;
2259       if (!state) state = 1;
2260      }
2261     }
2262     if ( aho->states[ state ].wordnum ) {
2263      U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2264      if (!leftmost || lpos < leftmost) {
2265       DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2266       leftmost = lpos;
2267      }
2268     }
2269     if (leftmost) {
2270      s = (char*)leftmost;
2271      DEBUG_TRIE_EXECUTE_r({
2272       PerlIO_printf(
2273        Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2274        (UV)accepted_word, (IV)(s - real_start)
2275       );
2276      });
2277      if (reginfo->intuit || regtry(reginfo, &s)) {
2278       FREETMPS;
2279       LEAVE;
2280       goto got_it;
2281      }
2282      s = HOPc(s,1);
2283      DEBUG_TRIE_EXECUTE_r({
2284       PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2285      });
2286     } else {
2287      DEBUG_TRIE_EXECUTE_r(
2288       PerlIO_printf( Perl_debug_log,"No match.\n"));
2289      break;
2290     }
2291    }
2292    FREETMPS;
2293    LEAVE;
2294   }
2295   break;
2296  default:
2297   Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2298  }
2299  return 0;
2300   got_it:
2301  return s;
2302 }
2303
2304 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2305  * flags have same meanings as with regexec_flags() */
2306
2307 static void
2308 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2309        char *strbeg,
2310        char *strend,
2311        SV *sv,
2312        U32 flags,
2313        bool utf8_target)
2314 {
2315  struct regexp *const prog = ReANY(rx);
2316
2317  if (flags & REXEC_COPY_STR) {
2318 #ifdef PERL_ANY_COW
2319   if (SvCANCOW(sv)) {
2320    if (DEBUG_C_TEST) {
2321     PerlIO_printf(Perl_debug_log,
2322        "Copy on write: regexp capture, type %d\n",
2323        (int) SvTYPE(sv));
2324    }
2325    /* Create a new COW SV to share the match string and store
2326    * in saved_copy, unless the current COW SV in saved_copy
2327    * is valid and suitable for our purpose */
2328    if ((   prog->saved_copy
2329     && SvIsCOW(prog->saved_copy)
2330     && SvPOKp(prog->saved_copy)
2331     && SvIsCOW(sv)
2332     && SvPOKp(sv)
2333     && SvPVX(sv) == SvPVX(prog->saved_copy)))
2334    {
2335     /* just reuse saved_copy SV */
2336     if (RXp_MATCH_COPIED(prog)) {
2337      Safefree(prog->subbeg);
2338      RXp_MATCH_COPIED_off(prog);
2339     }
2340    }
2341    else {
2342     /* create new COW SV to share string */
2343     RX_MATCH_COPY_FREE(rx);
2344     prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2345    }
2346    prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2347    assert (SvPOKp(prog->saved_copy));
2348    prog->sublen  = strend - strbeg;
2349    prog->suboffset = 0;
2350    prog->subcoffset = 0;
2351   } else
2352 #endif
2353   {
2354    SSize_t min = 0;
2355    SSize_t max = strend - strbeg;
2356    SSize_t sublen;
2357
2358    if (    (flags & REXEC_COPY_SKIP_POST)
2359     && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2360     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2361    ) { /* don't copy $' part of string */
2362     U32 n = 0;
2363     max = -1;
2364     /* calculate the right-most part of the string covered
2365     * by a capture. Due to look-ahead, this may be to
2366     * the right of $&, so we have to scan all captures */
2367     while (n <= prog->lastparen) {
2368      if (prog->offs[n].end > max)
2369       max = prog->offs[n].end;
2370      n++;
2371     }
2372     if (max == -1)
2373      max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2374        ? prog->offs[0].start
2375        : 0;
2376     assert(max >= 0 && max <= strend - strbeg);
2377    }
2378
2379    if (    (flags & REXEC_COPY_SKIP_PRE)
2380     && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2381     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2382    ) { /* don't copy $` part of string */
2383     U32 n = 0;
2384     min = max;
2385     /* calculate the left-most part of the string covered
2386     * by a capture. Due to look-behind, this may be to
2387     * the left of $&, so we have to scan all captures */
2388     while (min && n <= prog->lastparen) {
2389      if (   prog->offs[n].start != -1
2390       && prog->offs[n].start < min)
2391      {
2392       min = prog->offs[n].start;
2393      }
2394      n++;
2395     }
2396     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2397      && min >  prog->offs[0].end
2398     )
2399      min = prog->offs[0].end;
2400
2401    }
2402
2403    assert(min >= 0 && min <= max && min <= strend - strbeg);
2404    sublen = max - min;
2405
2406    if (RX_MATCH_COPIED(rx)) {
2407     if (sublen > prog->sublen)
2408      prog->subbeg =
2409        (char*)saferealloc(prog->subbeg, sublen+1);
2410    }
2411    else
2412     prog->subbeg = (char*)safemalloc(sublen+1);
2413    Copy(strbeg + min, prog->subbeg, sublen, char);
2414    prog->subbeg[sublen] = '\0';
2415    prog->suboffset = min;
2416    prog->sublen = sublen;
2417    RX_MATCH_COPIED_on(rx);
2418   }
2419   prog->subcoffset = prog->suboffset;
2420   if (prog->suboffset && utf8_target) {
2421    /* Convert byte offset to chars.
2422    * XXX ideally should only compute this if @-/@+
2423    * has been seen, a la PL_sawampersand ??? */
2424
2425    /* If there's a direct correspondence between the
2426    * string which we're matching and the original SV,
2427    * then we can use the utf8 len cache associated with
2428    * the SV. In particular, it means that under //g,
2429    * sv_pos_b2u() will use the previously cached
2430    * position to speed up working out the new length of
2431    * subcoffset, rather than counting from the start of
2432    * the string each time. This stops
2433    *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2434    * from going quadratic */
2435    if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2436     prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2437             SV_GMAGIC|SV_CONST_RETURN);
2438    else
2439     prog->subcoffset = utf8_length((U8*)strbeg,
2440          (U8*)(strbeg+prog->suboffset));
2441   }
2442  }
2443  else {
2444   RX_MATCH_COPY_FREE(rx);
2445   prog->subbeg = strbeg;
2446   prog->suboffset = 0;
2447   prog->subcoffset = 0;
2448   prog->sublen = strend - strbeg;
2449  }
2450 }
2451
2452
2453
2454
2455 /*
2456  - regexec_flags - match a regexp against a string
      *
      * Tries to match the compiled pattern rx against the string delimited
      * by strbeg and strend, starting the search at stringarg (or earlier
      * when \G makes that necessary, see below).
      *
      * Returns 1 if a match was found and 0 otherwise.  Croaks if the
      * regexp or stringarg is NULL, or if the program does not start with
      * REG_MAGIC.  On success, unless REXEC_NOT_FIRST is set in flags, the
      * capture string is recorded via S_reg_set_capture_string().
      *
      * All exits taken after the match machinery has been set up go through
      * one of the two labels at the bottom: got_it (success) or phooey
      * (failure); both unwind the savestack to oldsave.
2457  */
2458 I32
2459 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2460    char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2461 /* stringarg: the point in the string at which to begin matching */
2462 /* strend:    pointer to null at end of string */
2463 /* strbeg:    real beginning of string */
2464 /* minend:    end of match must be >= minend bytes after stringarg. */
2465 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2466  *            itself is accessed via the pointers above */
2467 /* data:      May be used for some additional optimizations.
2468    Currently unused. */
2469 /* flags:     For optimizations. See REXEC_* in regexp.h */
2470
2471 {
2472  struct regexp *const prog = ReANY(rx);
2473  char *s;
2474  regnode *c;
2475  char *startpos;
2476  SSize_t minlen;  /* must match at least this many chars */
2477  SSize_t dontbother = 0; /* how many characters not to try at end */
2478  const bool utf8_target = cBOOL(DO_UTF8(sv));
2479  I32 multiline;
2480  RXi_GET_DECL(prog,progi);
2481  regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2482  regmatch_info *const reginfo = &reginfo_buf;
      /* swap holds the previous prog->offs buffer while a fresh one is in
       * use; it stays NULL unless the swap is set up further down */
2483  regexp_paren_pair *swap = NULL;
2484  I32 oldsave;
2485  GET_RE_DEBUG_FLAGS_DECL;
2486
2487  PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2488  PERL_UNUSED_ARG(data);
2489
2490  /* Be paranoid... */
2491  if (prog == NULL || stringarg == NULL) {
2492   Perl_croak(aTHX_ "NULL regexp parameter");
2493  }
2494
2495  DEBUG_EXECUTE_r(
2496   debug_start_match(rx, utf8_target, stringarg, strend,
2497   "Matching");
2498  );
2499
2500  startpos = stringarg;
2501
2502  if (prog->intflags & PREGf_GPOS_SEEN) {
2503   MAGIC *mg;
2504
2505   /* set reginfo->ganch, the position where \G can match */
2506
2507   reginfo->ganch =
2508    (flags & REXEC_IGNOREPOS)
2509    ? stringarg /* use start pos rather than pos() */
2510    : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2511    /* Defined pos(): */
2512    ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2513    : strbeg; /* pos() not defined; use start of string */
2514
2515   DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2516    "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2517
2518   /* in the presence of \G, we may need to start looking earlier in
2519   * the string than the suggested start point of stringarg:
2520   * if prog->gofs is set, then that's a known, fixed minimum
2521   * offset, such as
2522   * /..\G/:   gofs = 2
2523   * /ab|c\G/: gofs = 1
2524   * or if the minimum offset isn't known, then we have to go back
2525   * to the start of the string, e.g. /w+\G/
2526   */
2527
2528   if (prog->intflags & PREGf_ANCH_GPOS) {
2529    startpos  = reginfo->ganch - prog->gofs;
2530    if (startpos <
2531     ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2532    {
2533     DEBUG_r(PerlIO_printf(Perl_debug_log,
2534       "fail: ganch-gofs before earliest possible start\n"));
2535     return 0;
2536    }
2537   }
2538   else if (prog->gofs) {
        /* back up by gofs, but never to before the start of the string */
2539    if (startpos - prog->gofs < strbeg)
2540     startpos = strbeg;
2541    else
2542     startpos -= prog->gofs;
2543   }
2544   else if (prog->intflags & PREGf_GPOS_FLOAT)
2545    startpos = strbeg;
2546  }
2547
2548  minlen = prog->minlen;
2549  if ((startpos + minlen) > strend || startpos < strbeg) {
2550   DEBUG_r(PerlIO_printf(Perl_debug_log,
2551      "Regex match can't succeed, so not even tried\n"));
2552   return 0;
2553  }
2554
2555  /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2556  * which will call destructors to reset PL_regmatch_state, free higher
2557  * PL_regmatch_slabs, and clean up regmatch_info_aux and
2558  * regmatch_info_aux_eval */
2559
2560  oldsave = PL_savestack_ix;
2561
2562  s = startpos;
2563
2564  if ((prog->extflags & RXf_USE_INTUIT)
2565   && !(flags & REXEC_CHECKED))
2566  {
2567   s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2568          flags, NULL);
2569   if (!s)
2570    return 0;
2571
2572   if (prog->extflags & RXf_CHECK_ALL) {
2573    /* we can match based purely on the result of INTUIT.
2574    * Set up captures etc just for $& and $-[0]
2575    * (an intuit-only match won't have $1,$2,..) */
2576    assert(!prog->nparens);
2577
2578    /* s/// doesn't like it if $& is earlier than where we asked it to
2579    * start searching (which can happen on something like /.\G/) */
2580    if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2581      && (s < stringarg))
2582    {
2583     /* this should only be possible under \G */
2584     assert(prog->intflags & PREGf_GPOS_SEEN);
2585     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2586      "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2587     goto phooey;
2588    }
2589
2590    /* match via INTUIT shouldn't have any captures.
2591    * Let @-, @+, $^N know */
2592    prog->lastparen = prog->lastcloseparen = 0;
2593    RX_MATCH_UTF8_set(rx, utf8_target);
        /* $& runs from s for minlenret characters; under utf8 that
         * character count is turned into bytes by hopping */
2594    prog->offs[0].start = s - strbeg;
2595    prog->offs[0].end = utf8_target
2596     ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2597     : s - strbeg + prog->minlenret;
2598    if ( !(flags & REXEC_NOT_FIRST) )
2599     S_reg_set_capture_string(aTHX_ rx,
2600           strbeg, strend,
2601           sv, flags, utf8_target);
2602
2603    return 1;
2604   }
2605  }
2606
2607  multiline = prog->extflags & RXf_PMf_MULTILINE;
2608
2609  if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2610   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2611        "String too short [regexec_flags]...\n"));
2612   goto phooey;
2613  }
2614
2615  /* Check validity of program. */
2616  if (UCHARAT(progi->program) != REG_MAGIC) {
2617   Perl_croak(aTHX_ "corrupted regexp program");
2618  }
2619
2620  RX_MATCH_TAINTED_off(rx);
2621
      /* fill in the regmatch_info that is handed to regtry() and
       * find_byclass() below */
2622  reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2623  reginfo->intuit = 0;
2624  reginfo->is_utf8_target = cBOOL(utf8_target);
2625  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2626  reginfo->warned = FALSE;
2627  reginfo->strbeg  = strbeg;
2628  reginfo->sv = sv;
2629  reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2630  reginfo->strend = strend;
2631  /* see how far we have to get to not match where we matched before */
2632  reginfo->till = stringarg + minend;
2633
2634  if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2635   /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2636   S_cleanup_regmatch_info_aux has executed (registered by
2637   SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2638   magic belonging to this SV.
2639   Not newSVsv, either, as it does not COW.
2640   */
2641   reginfo->sv = newSV(0);
2642   SvSetSV_nosteal(reginfo->sv, sv);
2643   SAVEFREESV(reginfo->sv);
2644  }
2645
2646  /* reserve next 2 or 3 slots in PL_regmatch_state:
2647  * slot N+0: may currently be in use: skip it
2648  * slot N+1: use for regmatch_info_aux struct
2649  * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2650  * slot N+3: ready for use by regmatch()
2651  */
2652
2653  {
2654   regmatch_state *old_regmatch_state;
2655   regmatch_slab  *old_regmatch_slab;
2656   int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2657
2658   /* on first ever match, allocate first slab */
2659   if (!PL_regmatch_slab) {
2660    Newx(PL_regmatch_slab, 1, regmatch_slab);
2661    PL_regmatch_slab->prev = NULL;
2662    PL_regmatch_slab->next = NULL;
2663    PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2664   }
2665
2666   old_regmatch_state = PL_regmatch_state;
2667   old_regmatch_slab  = PL_regmatch_slab;
2668
       /* walk PL_regmatch_state forward one slot per iteration: i==0 only
        * steps over slot N+0, i==1 claims the info_aux slot and i==2 (when
        * max is 2) the info_aux_eval slot; S_push_slab() supplies the next
        * state whenever the step runs past SLAB_LAST of the current slab */
2669   for (i=0; i <= max; i++) {
2670    if (i == 1)
2671     reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2672    else if (i ==2)
2673     reginfo->info_aux_eval =
2674     reginfo->info_aux->info_aux_eval =
2675        &(PL_regmatch_state->u.info_aux_eval);
2676
2677    if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2678     PL_regmatch_state = S_push_slab(aTHX);
2679   }
2680
2681   /* note initial PL_regmatch_state position; at end of match we'll
2682   * pop back to there and free any higher slabs */
2683
2684   reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2685   reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2686   reginfo->info_aux->poscache = NULL;
2687
2688   SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2689
2690   if ((prog->extflags & RXf_EVAL_SEEN))
2691    S_setup_eval_state(aTHX_ reginfo);
2692   else
2693    reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2694  }
2695
2696  /* If there is a "must appear" string, look for it. */
2697
2698  if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2699   /* We have to be careful. If the previous successful match
2700   was from this regex we don't want a subsequent partially
2701   successful match to clobber the old results.
2702   So when we detect this possibility we add a swap buffer
2703   to the re, and switch the buffer each match. If we fail,
2704   we switch it back; otherwise we leave it swapped.
2705   */
2706   swap = prog->offs;
2707   /* do we need a save destructor here for eval dies? */
2708   Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2709   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2710    "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2711    PTR2UV(prog),
2712    PTR2UV(swap),
2713    PTR2UV(prog->offs)
2714   ));
2715  }
2716
2717  /* Simplest case:  anchored match need be tried only once. */
2718  /*  [unless only anchor is MBOL - implying multiline is set] */
2719  if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
       /* s can differ from startpos here only if re_intuit_start() above
        * moved it */
2720   if (s == startpos && regtry(reginfo, &s))
2721    goto got_it;
2722   else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
2723   {
2724    char *end;
2725
2726    if (minlen)
2727     dontbother = minlen - 1;
2728    end = HOP3c(strend, -dontbother, strbeg) - 1;
2729    /* for multiline we only have to try after newlines */
2730    if (prog->check_substr || prog->check_utf8) {
2731     /* because of the goto we can not easily reuse the macros for bifurcating the
2732     unicode/non-unicode match modes here like we do elsewhere - demerphq */
2733     if (utf8_target) {
          /* startpos itself was already tried (and failed) just above,
           * so jump into the loop past the regtry() call */
2734      if (s == startpos)
2735       goto after_try_utf8;
2736      while (1) {
2737       if (regtry(reginfo, &s)) {
2738        goto got_it;
2739       }
2740      after_try_utf8:
2741       if (s > end) {
2742        goto phooey;
2743       }
2744       if (prog->extflags & RXf_USE_INTUIT) {
2745        s = re_intuit_start(rx, sv, strbeg,
2746          s + UTF8SKIP(s), strend, flags, NULL);
2747        if (!s) {
2748         goto phooey;
2749        }
2750       }
2751       else {
2752        s += UTF8SKIP(s);
2753       }
2754      }
2755     } /* end search for check string in unicode */
2756     else {
          /* same loop as the utf8 one above, stepping one byte at a time */
2757      if (s == startpos) {
2758       goto after_try_latin;
2759      }
2760      while (1) {
2761       if (regtry(reginfo, &s)) {
2762        goto got_it;
2763       }
2764      after_try_latin:
2765       if (s > end) {
2766        goto phooey;
2767       }
2768       if (prog->extflags & RXf_USE_INTUIT) {
2769        s = re_intuit_start(rx, sv, strbeg,
2770           s + 1, strend, flags, NULL);
2771        if (!s) {
2772         goto phooey;
2773        }
2774       }
2775       else {
2776        s++;
2777       }
2778      }
2779     } /* end search for check string in latin*/
2780    } /* end search for check string */
2781    else { /* search for newline */
2782     if (s > startpos) {
2783      /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2784      s--;
2785     }
2786     /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2787     while (s <= end) { /* note it could be possible to match at the end of the string */
          /* the post-increment has already moved s past the newline, so
           * the attempt is made at the character that follows it */
2788      if (*s++ == '\n') { /* don't need PL_utf8skip here */
2789       if (regtry(reginfo, &s))
2790        goto got_it;
2791      }
2792     }
2793    } /* end search for newline */
2794   } /* end anchored/multiline check string search */
2795   goto phooey;
2796  } else if (prog->intflags & PREGf_ANCH_GPOS)
2797  {
2798   /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2799   assert(prog->intflags & PREGf_GPOS_SEEN);
2800   /* For anchored \G, the only position it can match from is
2801   * (ganch-gofs); we already set startpos to this above; if intuit
2802   * moved us on from there, we can't possibly succeed */
2803   assert(startpos == reginfo->ganch - prog->gofs);
2804   if (s == startpos && regtry(reginfo, &s))
2805    goto got_it;
2806   goto phooey;
2807  }
2808
2809  /* Messy cases:  unanchored match. */
2810  if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2811   /* we have /x+whatever/ */
2812   /* it must be a one character string (XXXX Except is_utf8_pat?) */
2813   char ch;
2814 #ifdef DEBUGGING
2815   int did_match = 0;
2816 #endif
       /* in both branches below: try at each position whose first byte is
        * ch and, if that attempt fails, step over the rest of the run of
        * positions that also start with ch before scanning on */
2817   if (utf8_target) {
2818    if (! prog->anchored_utf8) {
2819     to_utf8_substr(prog);
2820    }
2821    ch = SvPVX_const(prog->anchored_utf8)[0];
2822    REXEC_FBC_SCAN(
2823     if (*s == ch) {
2824      DEBUG_EXECUTE_r( did_match = 1 );
2825      if (regtry(reginfo, &s)) goto got_it;
2826      s += UTF8SKIP(s);
2827      while (s < strend && *s == ch)
2828       s += UTF8SKIP(s);
2829     }
2830    );
2831
2832   }
2833   else {
2834    if (! prog->anchored_substr) {
2835     if (! to_byte_substr(prog)) {
2836      NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2837     }
2838    }
2839    ch = SvPVX_const(prog->anchored_substr)[0];
2840    REXEC_FBC_SCAN(
2841     if (*s == ch) {
2842      DEBUG_EXECUTE_r( did_match = 1 );
2843      if (regtry(reginfo, &s)) goto got_it;
2844      s++;
2845      while (s < strend && *s == ch)
2846       s++;
2847     }
2848    );
2849   }
2850   DEBUG_EXECUTE_r(if (!did_match)
2851     PerlIO_printf(Perl_debug_log,
2852         "Did not find anchored character...\n")
2853    );
2854  }
2855  else if (prog->anchored_substr != NULL
2856    || prog->anchored_utf8 != NULL
2857    || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2858     && prog->float_max_offset < strend - s)) {
2859   SV *must;
2860   SSize_t back_max;
2861   SSize_t back_min;
2862   char *last;
2863   char *last1;  /* Last position checked before */
2864 #ifdef DEBUGGING
2865   int did_match = 0;
2866 #endif
       /* pick the substring to search for (anchored in preference to
        * floating, in the encoding of the target) together with the range
        * of offsets from the match start at which it may appear */
2867   if (prog->anchored_substr || prog->anchored_utf8) {
2868    if (utf8_target) {
2869     if (! prog->anchored_utf8) {
2870      to_utf8_substr(prog);
2871     }
2872     must = prog->anchored_utf8;
2873    }
2874    else {
2875     if (! prog->anchored_substr) {
2876      if (! to_byte_substr(prog)) {
2877       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2878      }
2879     }
2880     must = prog->anchored_substr;
2881    }
2882    back_max = back_min = prog->anchored_offset;
2883   } else {
2884    if (utf8_target) {
2885     if (! prog->float_utf8) {
2886      to_utf8_substr(prog);
2887     }
2888     must = prog->float_utf8;
2889    }
2890    else {
2891     if (! prog->float_substr) {
2892      if (! to_byte_substr(prog)) {
2893       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2894      }
2895     }
2896     must = prog->float_substr;
2897    }
2898    back_max = prog->float_max_offset;
2899    back_min = prog->float_min_offset;
2900   }
2901
2902   if (back_min<0) {
2903    last = strend;
2904   } else {
2905    last = HOP3c(strend, /* Cannot start after this */
2906     -(SSize_t)(CHR_SVLEN(must)
2907       - (SvTAIL(must) != 0) + back_min), strbeg);
2908   }
2909   if (s > reginfo->strbeg)
2910    last1 = HOPc(s, -1);
2911   else
2912    last1 = s - 1; /* bogus */
2913
2914   /* XXXX check_substr already used to find "s", can optimize if
2915   check_substr==must. */
       /* NOTE(review): dontbother is reset to 0 immediately before the hop,
        * so the next statement presumably leaves strend unchanged - confirm
        * against HOPc */
2916   dontbother = 0;
2917   strend = HOPc(strend, -dontbother);
2918   while ( (s <= last) &&
2919     (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
2920         (unsigned char*)strend, must,
2921         multiline ? FBMrf_MULTILINE : 0)) ) {
2922    DEBUG_EXECUTE_r( did_match = 1 );
        /* s now points at an occurrence of must; the start positions to
         * try run from just after the last position already checked (or
         * from s - back_max if that is later) up to s - back_min, which
         * becomes the new last1 */
2923    if (HOPc(s, -back_max) > last1) {
2924     last1 = HOPc(s, -back_min);
2925     s = HOPc(s, -back_max);
2926    }
2927    else {
2928     char * const t = (last1 >= reginfo->strbeg)
2929          ? HOPc(last1, 1) : last1 + 1;
2930
2931     last1 = HOPc(s, -back_min);
2932     s = t;
2933    }
2934    if (utf8_target) {
2935     while (s <= last1) {
2936      if (regtry(reginfo, &s))
2937       goto got_it;
2938      if (s >= last1) {
2939       s++; /* to break out of outer loop */
2940       break;
2941      }
2942      s += UTF8SKIP(s);
2943     }
2944    }
2945    else {
2946     while (s <= last1) {
2947      if (regtry(reginfo, &s))
2948       goto got_it;
2949      s++;
2950     }
2951    }
2952   }
2953   DEBUG_EXECUTE_r(if (!did_match) {
2954    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2955     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2956    PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2957        ((must == prog->anchored_substr || must == prog->anchored_utf8)
2958        ? "anchored" : "floating"),
2959     quoted, RE_SV_TAIL(must));
2960   });
2961   goto phooey;
2962  }
2963  else if ( (c = progi->regstclass) ) {
2964   if (minlen) {
2965    const OPCODE op = OP(progi->regstclass);
2966    /* don't bother with what can't match */
2967    if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2968     strend = HOPc(strend, -(minlen - 1));
2969   }
2970   DEBUG_EXECUTE_r({
2971    SV * const prop = sv_newmortal();
2972    regprop(prog, prop, c, reginfo);
2973    {
2974     RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2975      s,strend-s,60);
2976     PerlIO_printf(Perl_debug_log,
2977      "Matching stclass %.*s against %s (%d bytes)\n",
2978      (int)SvCUR(prop), SvPVX_const(prop),
2979      quoted, (int)(strend - s));
2980    }
2981   });
2982   if (find_byclass(prog, c, s, strend, reginfo))
2983    goto got_it;
2984   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2985  }
2986  else {
2987   dontbother = 0;
2988   if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2989    /* Trim the end. */
2990    char *last= NULL;
2991    SV* float_real;
2992    STRLEN len;
2993    const char *little;
2994
2995    if (utf8_target) {
2996     if (! prog->float_utf8) {
2997      to_utf8_substr(prog);
2998     }
2999     float_real = prog->float_utf8;
3000    }
3001    else {
3002     if (! prog->float_substr) {
3003      if (! to_byte_substr(prog)) {
3004       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3005      }
3006     }
3007     float_real = prog->float_substr;
3008    }
3009
3010    little = SvPV_const(float_real, len);
3011    if (SvTAIL(float_real)) {
3012      /* This means that float_real contains an artificial \n on
3013      * the end due to the presence of something like this:
3014      * /foo$/ where we can match both "foo" and "foo\n" at the
3015      * end of the string.  So we have to compare the end of the
3016      * string first against the float_real without the \n and
3017      * then against the full float_real with the string.  We
3018      * have to watch out for cases where the string might be
3019      * smaller than the float_real or the float_real without
3020      * the \n. */
3021      char *checkpos= strend - len;
3022      DEBUG_OPTIMISE_r(
3023       PerlIO_printf(Perl_debug_log,
3024        "%sChecking for float_real.%s\n",
3025        PL_colors[4], PL_colors[5]));
3026      if (checkpos + 1 < strbeg) {
3027       /* can't match, even if we remove the trailing \n
3028       * string is too short to match */
3029       DEBUG_EXECUTE_r(
3030        PerlIO_printf(Perl_debug_log,
3031         "%sString shorter than required trailing substring, cannot match.%s\n",
3032         PL_colors[4], PL_colors[5]));
3033       goto phooey;
3034      } else if (memEQ(checkpos + 1, little, len - 1)) {
3035       /* can match, the end of the string matches without the
3036       * "\n" */
3037       last = checkpos + 1;
3038      } else if (checkpos < strbeg) {
3039       /* can't match, string is too short when the "\n" is
3040       * included */
3041       DEBUG_EXECUTE_r(
3042        PerlIO_printf(Perl_debug_log,
3043         "%sString does not contain required trailing substring, cannot match.%s\n",
3044         PL_colors[4], PL_colors[5]));
3045       goto phooey;
3046      } else if (!multiline) {
3047       /* non multiline match, so compare with the "\n" at the
3048       * end of the string */
3049       if (memEQ(checkpos, little, len)) {
3050        last= checkpos;
3051       } else {
3052        DEBUG_EXECUTE_r(
3053         PerlIO_printf(Perl_debug_log,
3054          "%sString does not contain required trailing substring, cannot match.%s\n",
3055          PL_colors[4], PL_colors[5]));
3056        goto phooey;
3057       }
3058      } else {
3059       /* multiline match, so we have to search for a place
3060       * where the full string is located */
3061       goto find_last;
3062      }
3063    } else {
3064     find_last:
3065      if (len)
3066       last = rninstr(s, strend, little, little + len);
3067      else
3068       last = strend; /* matching "$" */
3069    }
3070    if (!last) {
3071     /* at one point this block contained a comment which was
3072     * probably incorrect, which said that this was a "should not
3073     * happen" case.  Even if it was true when it was written I am
3074     * pretty sure it is not anymore, so I have removed the comment
3075     * and replaced it with this one. Yves */
3076     DEBUG_EXECUTE_r(
3077      PerlIO_printf(Perl_debug_log,
3078       "String does not contain required substring, cannot match.\n"
3079      ));
3080     goto phooey;
3081    }
3082    dontbother = strend - last + prog->float_min_offset;
3083   }
3084   if (minlen && (dontbother < minlen))
3085    dontbother = minlen - 1;
3086   strend -= dontbother;      /* this one's always in bytes! */
3087   /* We don't know much -- general case. */
3088   if (utf8_target) {
3089    for (;;) {
3090     if (regtry(reginfo, &s))
3091      goto got_it;
3092     if (s >= strend)
3093      break;
3094     s += UTF8SKIP(s);
3095    };
3096   }
3097   else {
        /* the post-increment in the loop condition means that a final
         * attempt is also made with s equal to the (trimmed) strend */
3098    do {
3099     if (regtry(reginfo, &s))
3100      goto got_it;
3101    } while (s++ < strend);
3102   }
3103  }
3104
3105  /* Failure. */
3106  goto phooey;
3107
3108 got_it:
3109  /* s/// doesn't like it if $& is earlier than where we asked it to
3110  * start searching (which can happen on something like /.\G/) */
3111  if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3112    && (prog->offs[0].start < stringarg - strbeg))
3113  {
3114   /* this should only be possible under \G */
3115   assert(prog->intflags & PREGf_GPOS_SEEN);
3116   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3117    "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3118   goto phooey;
3119  }
3120
3121  DEBUG_BUFFERS_r(
3122   if (swap)
3123    PerlIO_printf(Perl_debug_log,
3124     "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3125     PTR2UV(prog),
3126     PTR2UV(swap)
3127    );
3128  );
      /* the match succeeded, so the offs buffer allocated for this match
       * stays installed in prog->offs and the previous buffer (if one was
       * swapped out) is released */
3129  Safefree(swap);
3130
3131  /* clean up; this will trigger destructors that will free all slabs
3132  * above the current one, and cleanup the regmatch_info_aux
3133  * and regmatch_info_aux_eval structs */
3134
3135  LEAVE_SCOPE(oldsave);
3136
3137  if (RXp_PAREN_NAMES(prog))
3138   (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3139
3140  RX_MATCH_UTF8_set(rx, utf8_target);
3141
3142  /* make sure $`, $&, $', and $digit will work later */
      /* reginfo->strend is passed rather than the local strend, which some
       * of the branches above have moved back */
3143  if ( !(flags & REXEC_NOT_FIRST) )
3144   S_reg_set_capture_string(aTHX_ rx,
3145          strbeg, reginfo->strend,
3146          sv, flags, utf8_target);
3147
3148  return 1;
3149
3150 phooey:
3151  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3152       PL_colors[4], PL_colors[5]));
3153
3154  /* clean up; this will trigger destructors that will free all slabs
3155  * above the current one, and cleanup the regmatch_info_aux
3156  * and regmatch_info_aux_eval structs */
3157
3158  LEAVE_SCOPE(oldsave);
3159
3160  if (swap) {
3161   /* we failed :-( roll it back */
3162   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3163    "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3164    PTR2UV(prog),
3165    PTR2UV(prog->offs),
3166    PTR2UV(swap)
3167   ));
3168   Safefree(prog->offs);
3169   prog->offs = swap;
3170  }
3171  return 0;
3172 }
3173
3174
/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
 * Do inc before dec, in case old and new rex are the same.
 *
 * The update is only performed when reginfo->info_aux_eval is non-NULL;
 * otherwise the macro does nothing.  A variable named 'reginfo' must be in
 * scope wherever the macro is used.
 *
 * NOTE(review): this expands to a bare 'if' statement (no do/while(0) or
 * STMT_START/STMT_END wrapper), so an 'else' written directly after a use
 * would attach to this 'if' -- confirm no caller does that. */
#define SET_reg_curpm(Re2)                          \
 if (reginfo->info_aux_eval) {                   \
  (void)ReREFCNT_inc(Re2);      \
  ReREFCNT_dec(PM_GETRE(PL_reg_curpm));     \
  PM_SETRE((PL_reg_curpm), (Re2));     \
 }
3183
3184
3185 /*
3186  - regtry - try match at specific point
3187  */
3188 STATIC I32   /* 0 failure, 1 success */
3189 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3190 {
3191  CHECKPOINT lastcp;
3192  REGEXP *const rx = reginfo->prog;
3193  regexp *const prog = ReANY(rx);
3194  SSize_t result;
3195  RXi_GET_DECL(prog,progi);
3196  GET_RE_DEBUG_FLAGS_DECL;
3197
3198  PERL_ARGS_ASSERT_REGTRY;
3199
3200  reginfo->cutpoint=NULL;
3201
3202  prog->offs[0].start = *startposp - reginfo->strbeg;
3203  prog->lastparen = 0;
3204  prog->lastcloseparen = 0;
3205
3206  /* XXXX What this code is doing here?!!!  There should be no need
3207  to do this again and again, prog->lastparen should take care of
3208  this!  --ilya*/
3209
3210  /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3211  * Actually, the code in regcppop() (which Ilya may be meaning by
3212  * prog->lastparen), is not needed at all by the test suite
3213  * (op/regexp, op/pat, op/split), but that code is needed otherwise
3214  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3215  * Meanwhile, this code *is* needed for the
3216  * above-mentioned test suite tests to succeed.  The common theme
3217  * on those tests seems to be returning null fields from matches.
3218  * --jhi updated by dapm */
3219 #if 1
3220  if (prog->nparens) {
3221   regexp_paren_pair *pp = prog->offs;
3222   I32 i;
3223   for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3224    ++pp;
3225    pp->start = -1;
3226    pp->end = -1;
3227   }
3228  }
3229 #endif
3230  REGCP_SET(lastcp);
3231  result = regmatch(reginfo, *startposp, progi->program + 1);
3232  if (result != -1) {
3233   prog->offs[0].end = result;
3234   return 1;
3235  }
3236  if (reginfo->cutpoint)
3237   *startposp= reginfo->cutpoint;
3238  REGCP_UNWIND(lastcp);
3239  return 0;
3240 }
3241
3242
/* Shorthand jumps used inside the matching function.  Each expands to a
 * plain 'goto', so labels named 'yes', 'no' and 'no_silent' must exist in
 * any function that uses them. */
#define sayYES goto yes
#define sayNO goto no
#define sayNO_SILENT goto no_silent

/* If ST.cache_mask is non-zero, OR it into the poscache byte at
 * ST.cache_offset, then fail via sayNO.
 *
 * We don't use STMT_START/END here because it leads to
 * "unreachable code" warnings, which are bogus, but distracting.
 * As a consequence this expands to two separate statements (an 'if' and a
 * 'goto'), so it must not be used as the unbraced body of an if/else/loop. */
#define CACHEsayNO \
 if (ST.cache_mask) \
 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
 sayNO

/* this is used to determine how far from the left messages like
   'failed...' are printed. It should be set such that messages
   are inline with the regop output that created them.
*/
#define REPORT_CODE_OFF 32


/* Special (negative) values for the c1/c2 "next character" test values */
#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
#define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
/* Two distinct placeholders that S_setup_EXACTISH_ST_c1_c2() stores in
 * *c1p / *c2p when the target is UTF-8 and the corresponding character is
 * not a UTF-8 invariant, so cannot be represented as a single byte */
#define CHRTEST_NOT_A_CP_1 -999
#define CHRTEST_NOT_A_CP_2 -998
3265
3266 /* grab a new slab and return the first slot in it */
3267
3268 STATIC regmatch_state *
3269 S_push_slab(pTHX)
3270 {
3271 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3272  dMY_CXT;
3273 #endif
3274  regmatch_slab *s = PL_regmatch_slab->next;
3275  if (!s) {
3276   Newx(s, 1, regmatch_slab);
3277   s->prev = PL_regmatch_slab;
3278   s->next = NULL;
3279   PL_regmatch_slab->next = s;
3280  }
3281  PL_regmatch_slab = s;
3282  return SLAB_FIRST(s);
3283 }
3284
3285
/* push a new state then goto it
 *
 * Records 'input' in pushinput, makes 'node' the next node to execute
 * (scan), stores 'state' as the resume state of the current backtrack
 * struct (st), and jumps to the 'push_state' label.  The variables
 * pushinput, scan and st, and the label, must exist at the point of use.
 *
 * Note that the expansion is several statements and already ends with a
 * semicolon, so it must not be used as the unbraced body of an if/else. */

#define PUSH_STATE_GOTO(state, node, input) \
 pushinput = input; \
 scan = node; \
 st->resume_state = state; \
 goto push_state;

/* push a new state with success backtracking, then goto it
 *
 * Same as PUSH_STATE_GOTO, except that control is transferred to the
 * 'push_yes_state' label instead. */

#define PUSH_YES_STATE_GOTO(state, node, input) \
 pushinput = input; \
 scan = node; \
 st->resume_state = state; \
 goto push_yes_state;
3301
3302
3303
3304
3305 /*
3306
3307 regmatch() - main matching routine
3308
3309 This is basically one big switch statement in a loop. We execute an op,
set 'next' to point to the next op, and continue. If we come to a point which
3311 we may need to backtrack to on failure such as (A|B|C), we push a
3312 backtrack state onto the backtrack stack. On failure, we pop the top
3313 state, and re-enter the loop at the state indicated. If there are no more
3314 states to pop, we return failure.
3315
3316 Sometimes we also need to backtrack on success; for example /A+/, where
3317 after successfully matching one A, we need to go back and try to
3318 match another one; similarly for lookahead assertions: if the assertion
3319 completes successfully, we backtrack to the state just before the assertion
3320 and then carry on.  In these cases, the pushed state is marked as
3321 'backtrack on success too'. This marking is in fact done by a chain of
3322 pointers, each pointing to the previous 'yes' state. On success, we pop to
3323 the nearest yes state, discarding any intermediate failure-only states.
3324 Sometimes a yes state is pushed just to force some cleanup code to be
3325 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3326 it to free the inner regex.
3327
3328 Note that failure backtracking rewinds the cursor position, while
3329 success backtracking leaves it alone.
3330
3331 A pattern is complete when the END op is executed, while a subpattern
3332 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3333 ops trigger the "pop to last yes state if any, otherwise return true"
3334 behaviour.
3335
3336 A common convention in this function is to use A and B to refer to the two
3337 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3338 the subpattern to be matched possibly multiple times, while B is the entire
3339 rest of the pattern. Variable and state names reflect this convention.
3340
3341 The states in the main switch are the union of ops and failure/success of
substates associated with that op.  For example, IFMATCH is the op
3343 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3344 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3345 successfully matched A and IFMATCH_A_fail is a state saying that we have
3346 just failed to match A. Resume states always come in pairs. The backtrack
3347 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3348 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3349 on success or failure.
3350
3351 The struct that holds a backtracking state is actually a big union, with
3352 one variant for each major type of op. The variable st points to the
3353 top-most backtrack struct. To make the code clearer, within each
3354 block of code we #define ST to alias the relevant union.
3355
3356 Here's a concrete example of a (vastly oversimplified) IFMATCH
3357 implementation:
3358
3359  switch (state) {
3360  ....
3361
3362 #define ST st->u.ifmatch
3363
3364  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3365   ST.foo = ...; // some state we wish to save
3366   ...
3367   // push a yes backtrack state with a resume value of
3368   // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3369   // first node of A:
3370   PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3371   // NOTREACHED
3372
3373  case IFMATCH_A: // we have successfully executed A; now continue with B
3374   next = B;
3375   bar = ST.foo; // do something with the preserved value
3376   break;
3377
3378  case IFMATCH_A_fail: // A failed, so the assertion failed
3379   ...;   // do some housekeeping, then ...
3380   sayNO; // propagate the failure
3381
3382 #undef ST
3383
3384  ...
3385  }
3386
3387 For any old-timers reading this who are familiar with the old recursive
3388 approach, the code above is equivalent to:
3389
3390  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3391  {
3392   int foo = ...
3393   ...
3394   if (regmatch(A)) {
3395    next = B;
3396    bar = foo;
3397    break;
3398   }
3399   ...;   // do some housekeeping, then ...
3400   sayNO; // propagate the failure
3401  }
3402
3403 The topmost backtrack state, pointed to by st, is usually free. If you
3404 want to claim it, populate any ST.foo fields in it with values you wish to
3405 save, then do one of
3406
3407   PUSH_STATE_GOTO(resume_state, node, newinput);
3408   PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3409
3410 which sets that backtrack state's resume value to 'resume_state', pushes a
3411 new free entry to the top of the backtrack stack, then goes to 'node'.
3412 On backtracking, the free slot is popped, and the saved state becomes the
3413 new free state. An ST.foo field in this new top state can be temporarily
3414 accessed to retrieve values, but once the main loop is re-entered, it
3415 becomes available for reuse.
3416
3417 Note that the depth of the backtrack stack constantly increases during the
3418 left-to-right execution of the pattern, rather than going up and down with
3419 the pattern nesting. For example the stack is at its maximum at Z at the
3420 end of the pattern, rather than at X in the following:
3421
3422  /(((X)+)+)+....(Y)+....Z/
3423
3424 The only exceptions to this are lookahead/behind assertions and the cut,
3425 (?>A), which pop all the backtrack states associated with A before
3426 continuing.
3427
3428 Backtrack state structs are allocated in slabs of about 4K in size.
3429 PL_regmatch_state and st always point to the currently active state,
3430 and PL_regmatch_slab points to the slab currently containing
3431 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3432 allocated, and is never freed until interpreter destruction. When the slab
3433 is full, a new one is allocated and chained to the end. At exit from
3434 regmatch(), slabs allocated since entry are freed.
3435
3436 */
3437
3438
/* Debug trace for a backtrack-state push or pop.  'pp' must be a string
 * literal; it is pasted into the format string ahead of the state name.
 * Under DEBUG_STATE_r this dumps the current execution position and then
 * prints, indented by depth*2 columns, the name of st->resume_state,
 * followed by "[Y]", "[M]" or "[YM]" when st is the current yes_state
 * and/or mark_state.
 *
 * Relies on locinput, scan, utf8_target, depth, st, yes_state and
 * mark_state being in scope at the point of use. */
#define DEBUG_STATE_pp(pp)        \
 DEBUG_STATE_r({         \
  DUMP_EXEC_POS(locinput, scan, utf8_target);         \
  PerlIO_printf(Perl_debug_log,       \
   "    %*s"pp" %s%s%s%s%s\n",       \
   depth*2, "",        \
   PL_reg_name[st->resume_state],                  \
   ((st==yes_state||st==mark_state) ? "[" : ""),   \
   ((st==yes_state) ? "Y" : ""),                   \
   ((st==mark_state) ? "M" : ""),                  \
   ((st==yes_state||st==mark_state) ? "]" : "")    \
  );                                                  \
 });
3452
3453
/* Position of node x relative to 'prog' (computed by pointer subtraction
 * and cast to int), or -1 if x is NULL.  A variable named 'prog' must be in
 * scope at the point of use. */
#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3455
3456 #ifdef DEBUGGING
3457
3458 STATIC void
3459 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3460  const char *start, const char *end, const char *blurb)
3461 {
3462  const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3463
3464  PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3465
3466  if (!PL_colorset)
3467    reginitcolors();
3468  {
3469   RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3470    RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3471
3472   RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3473    start, end - start, 60);
3474
3475   PerlIO_printf(Perl_debug_log,
3476    "%s%s REx%s %s against %s\n",
3477      PL_colors[4], blurb, PL_colors[5], s0, s1);
3478
3479   if (utf8_target||utf8_pat)
3480    PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3481     utf8_pat ? "pattern" : "",
3482     utf8_pat && utf8_target ? " and " : "",
3483     utf8_target ? "string" : ""
3484    );
3485  }
3486 }
3487
/* Debugging aid: print a one-line picture of the current match position to
 * Perl_debug_log, with no trailing newline.
 *
 *   locinput          current position in the target string
 *   scan              node being executed (only its op is examined here)
 *   loc_regeol        end of the target string
 *   loc_bostr         beginning of the target string
 *   loc_reg_starttry  where the current match attempt started
 *   utf8_target       true if the target string is UTF-8
 *
 * The output is the offset of locinput from loc_bostr, then '<', up to three
 * separately rendered pieces of text (before starttry, from starttry to
 * locinput, and from locinput onwards), '>', padding, and a final '|'. */
STATIC void
S_dump_exec_pos(pTHX_ const char *locinput,
     const regnode *scan,
     const char *loc_regeol,
     const char *loc_bostr,
     const char *loc_reg_starttry,
     const bool utf8_target)
{
 /* colouring is on if any of these three colour strings is non-empty */
 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
 /* l: how many bytes after locinput to consider, capped at taill */
 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
 /* The part of the string before starttry has one color
 (pref0_len chars), between starttry and current
 position another one (pref_len - pref0_len chars),
 after the current position the third one.
 We assume that pref0_len <= pref_len, otherwise we
 decrease pref0_len.  */
 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
  ? (5 + taill) - l : locinput - loc_bostr;
 int pref0_len;

 PERL_ARGS_ASSERT_DUMP_EXEC_POS;

 /* Lengthen the prefix until it no longer starts on a UTF-8 continuation
  * byte.
  * NOTE(review): nothing here stops pref_len growing past
  * (locinput - loc_bostr); presumably this relies on the target being
  * well-formed UTF-8 -- confirm. */
 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
  pref_len++;
 pref0_len = pref_len  - (locinput - loc_reg_starttry);
 /* if the prefix left room to spare, let the tail use it */
 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
  l = ( loc_regeol - locinput > (5 + taill) - pref_len
   ? (5 + taill) - pref_len : loc_regeol - locinput);
 /* Shorten the tail until it no longer ends just before a continuation
  * byte.
  * NOTE(review): there is no lower bound on l in this loop, and l is not
  * read again after it within this function. */
 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
  l--;
 /* clamp pref0_len into [0, pref_len] */
 if (pref0_len < 0)
  pref0_len = 0;
 if (pref0_len > pref_len)
  pref0_len = pref_len;
 {
  /* treat the text as UTF-8 only if the target is UTF-8 and the current
   * op is not CANY */
  const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;

  /* s0: text before the start of the current attempt */
  RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
   (locinput - pref_len),pref0_len, 60, 4, 5);

  /* s1: text from the start of the attempt up to the current position */
  RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
     (locinput - pref_len + pref0_len),
     pref_len - pref0_len, 60, 2, 3);

  /* s2: text from the current position onwards */
  RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
     locinput, loc_regeol - locinput, 10, 0, 1);

  /* pad so that the three pieces together occupy at least 19 columns */
  const STRLEN tlen=len0+len1+len2;
  PerlIO_printf(Perl_debug_log,
     "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
     (IV)(locinput - loc_bostr),
     len0, s0,
     len1, s1,
     (docolor ? "" : "> <"),
     len2, s2,
     (int)(tlen > 19 ? 0 :  19 - tlen),
     "");
 }
}
3548
3549 #endif
3550
3551 /* reg_check_named_buff_matched()
3552  * Checks to see if a named buffer has matched. The data array of
3553  * buffer numbers corresponding to the buffer is expected to reside
3554  * in the regexp->data->data array in the slot stored in the ARG() of
3555  * node involved. Note that this routine doesn't actually care about the
3556  * name, that information is not preserved from compilation to execution.
3557  * Returns the index of the leftmost defined buffer with the given name
3558  * or 0 if non of the buffers matched.
3559  */
3560 STATIC I32
3561 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3562 {
3563  I32 n;
3564  RXi_GET_DECL(rex,rexi);
3565  SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3566  I32 *nums=(I32*)SvPVX(sv_dat);
3567
3568  PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3569
3570  for ( n=0; n<SvIVX(sv_dat); n++ ) {
3571   if ((I32)rex->lastparen >= nums[n] &&
3572    rex->offs[nums[n]].end != -1)
3573   {
3574    return nums[n];
3575   }
3576  }
3577  return 0;
3578 }
3579
3580
3581 static bool
3582 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3583   U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3584 {
3585  /* This function determines if there are one or two characters that match
3586  * the first character of the passed-in EXACTish node <text_node>, and if
3587  * so, returns them in the passed-in pointers.
3588  *
3589  * If it determines that no possible character in the target string can
3590  * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3591  * the first character in <text_node> requires UTF-8 to represent, and the
3592  * target string isn't in UTF-8.)
3593  *
3594  * If there are more than two characters that could match the beginning of
3595  * <text_node>, or if more context is required to determine a match or not,
3596  * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3597  *
3598  * The motiviation behind this function is to allow the caller to set up
3599  * tight loops for matching.  If <text_node> is of type EXACT, there is
3600  * only one possible character that can match its first character, and so
3601  * the situation is quite simple.  But things get much more complicated if
3602  * folding is involved.  It may be that the first character of an EXACTFish
3603  * node doesn't participate in any possible fold, e.g., punctuation, so it
3604  * can be matched only by itself.  The vast majority of characters that are
3605  * in folds match just two things, their lower and upper-case equivalents.
3606  * But not all are like that; some have multiple possible matches, or match
3607  * sequences of more than one character.  This function sorts all that out.
3608  *
3609  * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3610  * loop of trying to match A*, we know we can't exit where the thing
3611  * following it isn't a B.  And something can't be a B unless it is the
3612  * beginning of B.  By putting a quick test for that beginning in a tight
3613  * loop, we can rule out things that can't possibly be B without having to
3614  * break out of the loop, thus avoiding work.  Similarly, if A is a single
3615  * character, we can make a tight loop matching A*, using the outputs of
3616  * this function.
3617  *
3618  * If the target string to match isn't in UTF-8, and there aren't
3619  * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3620  * the one or two possible octets (which are characters in this situation)
3621  * that can match.  In all cases, if there is only one character that can
3622  * match, *<c1p> and *<c2p> will be identical.
3623  *
3624  * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3625  * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3626  * can match the beginning of <text_node>.  They should be declared with at
3627  * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3628  * undefined what these contain.)  If one or both of the buffers are
3629  * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3630  * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3631  * *<c2p> will be set to a negative number(s) that shouldn't match any code
3632  * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3633  * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3634
3635  const bool utf8_target = reginfo->is_utf8_target;
3636
3637  UV c1 = (UV)CHRTEST_NOT_A_CP_1;
3638  UV c2 = (UV)CHRTEST_NOT_A_CP_2;
3639  bool use_chrtest_void = FALSE;
3640  const bool is_utf8_pat = reginfo->is_utf8_pat;
3641
3642  /* Used when we have both utf8 input and utf8 output, to avoid converting
3643  * to/from code points */
3644  bool utf8_has_been_setup = FALSE;
3645
3646  dVAR;
3647
3648  U8 *pat = (U8*)STRING(text_node);
3649  U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3650
3651  if (OP(text_node) == EXACT) {
3652
3653   /* In an exact node, only one thing can be matched, that first
3654   * character.  If both the pat and the target are UTF-8, we can just
3655   * copy the input to the output, avoiding finding the code point of
3656   * that character */
3657   if (!is_utf8_pat) {
3658    c2 = c1 = *pat;
3659   }
3660   else if (utf8_target) {
3661    Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3662    Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3663    utf8_has_been_setup = TRUE;
3664   }
3665   else {
3666    c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3667   }
3668  }
3669  else { /* an EXACTFish node */
3670   U8 *pat_end = pat + STR_LEN(text_node);
3671
3672   /* An EXACTFL node has at least some characters unfolded, because what
3673   * they match is not known until now.  So, now is the time to fold
3674   * the first few of them, as many as are needed to determine 'c1' and
3675   * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
3676   * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3677   * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
3678   * need to fold as many characters as a single character can fold to,
3679   * so that later we can check if the first ones are such a multi-char
3680   * fold.  But, in such a pattern only locale-problematic characters
3681   * aren't folded, so we can skip this completely if the first character
3682   * in the node isn't one of the tricky ones */
3683   if (OP(text_node) == EXACTFL) {
3684
3685    if (! is_utf8_pat) {
3686     if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3687     {
3688      folded[0] = folded[1] = 's';
3689      pat = folded;
3690      pat_end = folded + 2;
3691     }
3692    }
3693    else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3694     U8 *s = pat;
3695     U8 *d = folded;
3696     int i;
3697
3698     for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3699      if (isASCII(*s)) {
3700       *(d++) = (U8) toFOLD_LC(*s);
3701       s++;
3702      }
3703      else {
3704       STRLEN len;
3705       _to_utf8_fold_flags(s,
3706            d,
3707            &len,
3708            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3709       d += len;
3710       s += UTF8SKIP(s);
3711      }
3712     }
3713
3714     pat = folded;
3715     pat_end = d;
3716    }
3717   }
3718
3719   if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
3720    || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
3721   {
3722    /* Multi-character folds require more context to sort out.  Also
3723    * PL_utf8_foldclosures used below doesn't handle them, so have to
3724    * be handled outside this routine */
3725    use_chrtest_void = TRUE;
3726   }
3727   else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3728    c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3729    if (c1 > 255) {
3730     /* Load the folds hash, if not already done */
3731     SV** listp;
3732     if (! PL_utf8_foldclosures) {
3733      _load_PL_utf8_foldclosures();
3734     }
3735
3736     /* The fold closures data structure is a hash with the keys
3737     * being the UTF-8 of every character that is folded to, like
3738     * 'k', and the values each an array of all code points that
3739     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
3740     * Multi-character folds are not included */
3741     if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3742           (char *) pat,
3743           UTF8SKIP(pat),
3744           FALSE))))
3745     {
3746      /* Not found in the hash, therefore there are no folds
3747      * containing it, so there is only a single character that
3748      * could match */
3749      c2 = c1;
3750     }
3751     else {  /* Does participate in folds */
3752      AV* list = (AV*) *listp;
3753      if (av_tindex(list) != 1) {
3754
3755       /* If there aren't exactly two folds to this, it is
3756       * outside the scope of this function */
3757       use_chrtest_void = TRUE;
3758      }
3759      else {  /* There are two.  Get them */
3760       SV** c_p = av_fetch(list, 0, FALSE);
3761       if (c_p == NULL) {
3762        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3763       }
3764       c1 = SvUV(*c_p);
3765
3766       c_p = av_fetch(list, 1, FALSE);
3767       if (c_p == NULL) {
3768        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3769       }
3770       c2 = SvUV(*c_p);
3771
3772       /* Folds that cross the 255/256 boundary are forbidden
3773       * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
3774       * one is ASCIII.  Since the pattern character is above
3775       * 255, and its only other match is below 256, the only
3776       * legal match will be to itself.  We have thrown away
3777       * the original, so have to compute which is the one
3778       * above 255. */
3779       if ((c1 < 256) != (c2 < 256)) {
3780        if ((OP(text_node) == EXACTFL
3781         && ! IN_UTF8_CTYPE_LOCALE)
3782         || ((OP(text_node) == EXACTFA
3783          || OP(text_node) == EXACTFA_NO_TRIE)
3784          && (isASCII(c1) || isASCII(c2))))
3785        {
3786         if (c1 < 256) {
3787          c1 = c2;
3788         }
3789         else {
3790          c2 = c1;
3791         }
3792        }
3793       }
3794      }
3795     }
3796    }
3797    else /* Here, c1 is <= 255 */
3798     if (utf8_target
3799      && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3800      && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
3801      && ((OP(text_node) != EXACTFA
3802       && OP(text_node) != EXACTFA_NO_TRIE)
3803       || ! isASCII(c1)))
3804    {
3805     /* Here, there could be something above Latin1 in the target
3806     * which folds to this character in the pattern.  All such
3807     * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
3808     * than two characters involved in their folds, so are outside
3809     * the scope of this function */
3810     if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3811      c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3812     }
3813     else {
3814      use_chrtest_void = TRUE;
3815     }
3816    }
3817    else { /* Here nothing above Latin1 can fold to the pattern
3818      character */
3819     switch (OP(text_node)) {
3820
3821      case EXACTFL:   /* /l rules */
3822       c2 = PL_fold_locale[c1];
3823       break;
3824
3825      case EXACTF:   /* This node only generated for non-utf8
3826          patterns */
3827       assert(! is_utf8_pat);
3828       if (! utf8_target) {    /* /d rules */
3829        c2 = PL_fold[c1];
3830        break;
3831       }
3832       /* FALLTHROUGH */
3833       /* /u rules for all these.  This happens to work for
3834       * EXACTFA as nothing in Latin1 folds to ASCII */
3835      case EXACTFA_NO_TRIE:   /* This node only generated for
3836            non-utf8 patterns */
3837       assert(! is_utf8_pat);
3838       /* FALLTHROUGH */
3839      case EXACTFA:
3840      case EXACTFU_SS:
3841      case EXACTFU:
3842       c2 = PL_fold_latin1[c1];
3843       break;
3844
3845      default:
3846       Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3847       assert(0); /* NOTREACHED */
3848     }
3849    }
3850   }
3851  }
3852
3853  /* Here have figured things out.  Set up the returns */
3854  if (use_chrtest_void) {
3855   *c2p = *c1p = CHRTEST_VOID;
3856  }
3857  else if (utf8_target) {
3858   if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3859    uvchr_to_utf8(c1_utf8, c1);
3860    uvchr_to_utf8(c2_utf8, c2);
3861   }
3862
3863   /* Invariants are stored in both the utf8 and byte outputs; Use
3864   * negative numbers otherwise for the byte ones.  Make sure that the
3865   * byte ones are the same iff the utf8 ones are the same */
3866   *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3867   *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3868     ? *c2_utf8
3869     : (c1 == c2)
3870     ? CHRTEST_NOT_A_CP_1
3871     : CHRTEST_NOT_A_CP_2;
3872  }
3873  else if (c1 > 255) {
3874  if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3875       can represent */
3876   return FALSE;
3877  }
3878
3879  *c1p = *c2p = c2;    /* c2 is the only representable value */
3880  }
3881  else {  /* c1 is representable; see about c2 */
3882  *c1p = c1;
3883  *c2p = (c2 < 256) ? c2 : c1;
3884  }
3885
3886  return TRUE;
3887 }
3888
3889 /* returns -1 on failure, $+[0] on success */
3890 STATIC SSize_t
3891 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3892 {
3893 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3894  dMY_CXT;
3895 #endif
3896  dVAR;
3897  const bool utf8_target = reginfo->is_utf8_target;
3898  const U32 uniflags = UTF8_ALLOW_DEFAULT;
3899  REGEXP *rex_sv = reginfo->prog;
3900  regexp *rex = ReANY(rex_sv);
3901  RXi_GET_DECL(rex,rexi);
3902  /* the current state. This is a cached copy of PL_regmatch_state */
3903  regmatch_state *st;
3904  /* cache heavy used fields of st in registers */
3905  regnode *scan;
3906  regnode *next;
3907  U32 n = 0; /* general value; init to avoid compiler warning */
3908  SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
3909  char *locinput = startpos;
3910  char *pushinput; /* where to continue after a PUSH */
3911  I32 nextchr;   /* is always set to UCHARAT(locinput) */
3912
3913  bool result = 0;     /* return value of S_regmatch */
3914  int depth = 0;     /* depth of backtrack stack */
3915  U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3916  const U32 max_nochange_depth =
3917   (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3918   3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3919  regmatch_state *yes_state = NULL; /* state to pop to on success of
3920                subpattern */
3921  /* mark_state piggy backs on the yes_state logic so that when we unwind
3922  the stack on success we can update the mark_state as we go */
3923  regmatch_state *mark_state = NULL; /* last mark state we have seen */
3924  regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3925  struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3926  U32 state_num;
3927  bool no_final = 0;      /* prevent failure from backtracking? */
3928  bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3929  char *startpoint = locinput;
3930  SV *popmark = NULL;     /* are we looking for a mark? */
3931  SV *sv_commit = NULL;   /* last mark name seen in failure */
3932  SV *sv_yes_mark = NULL; /* last mark name we have seen
3933        during a successful match */
3934  U32 lastopen = 0;       /* last open we saw */
3935  bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3936  SV* const oreplsv = GvSVn(PL_replgv);
3937  /* these three flags are set by various ops to signal information to
3938  * the very next op. They have a useful lifetime of exactly one loop
3939  * iteration, and are not preserved or restored by state pushes/pops
3940  */
3941  bool sw = 0;     /* the condition value in (?(cond)a|b) */
3942  bool minmod = 0;     /* the next "{n,m}" is a "{n,m}?" */
3943  int logical = 0;     /* the following EVAL is:
3944         0: (?{...})
3945         1: (?(?{...})X|Y)
3946         2: (??{...})
3947        or the following IFMATCH/UNLESSM is:
3948         false: plain (?=foo)
3949         true:  used as a condition: (?(?=foo))
3950        */
3951  PAD* last_pad = NULL;
3952  dMULTICALL;
3953  I32 gimme = G_SCALAR;
3954  CV *caller_cv = NULL; /* who called us */
3955  CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3956  CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3957  U32 maxopenparen = 0;       /* max '(' index seen so far */
3958  int to_complement;  /* Invert the result? */
3959  _char_class_number classnum;
3960  bool is_utf8_pat = reginfo->is_utf8_pat;
3961
3962 #ifdef DEBUGGING
3963  GET_RE_DEBUG_FLAGS_DECL;
3964 #endif
3965
3966  /* protect against undef(*^R) */
3967  SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
3968
3969  /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3970  multicall_oldcatch = 0;
3971  multicall_cv = NULL;
3972  cx = NULL;
3973  PERL_UNUSED_VAR(multicall_cop);
3974  PERL_UNUSED_VAR(newsp);
3975
3976
3977  PERL_ARGS_ASSERT_REGMATCH;
3978
3979  DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3980    PerlIO_printf(Perl_debug_log,"regmatch start\n");
3981  }));
3982
3983  st = PL_regmatch_state;
3984
3985  /* Note that nextchr is a byte even in UTF */
3986  SET_nextchr;
3987  scan = prog;
3988  while (scan != NULL) {
3989
3990   DEBUG_EXECUTE_r( {
3991    SV * const prop = sv_newmortal();
3992    regnode *rnext=regnext(scan);
3993    DUMP_EXEC_POS( locinput, scan, utf8_target );
3994    regprop(rex, prop, scan, reginfo);
3995
3996    PerlIO_printf(Perl_debug_log,
3997      "%3"IVdf":%*s%s(%"IVdf")\n",
3998      (IV)(scan - rexi->program), depth*2, "",
3999      SvPVX_const(prop),
4000      (PL_regkind[OP(scan)] == END || !rnext) ?
4001       0 : (IV)(rnext - rexi->program));
4002   });
4003
4004   next = scan + NEXT_OFF(scan);
4005   if (next == scan)
4006    next = NULL;
4007   state_num = OP(scan);
4008
4009   REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
4010  reenter_switch:
4011   to_complement = 0;
4012
4013   SET_nextchr;
4014   assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
4015
4016   switch (state_num) {
4017   case SBOL: /*  /^../ and /\A../  */
4018    if (locinput == reginfo->strbeg)
4019     break;
4020    sayNO;
4021
4022   case MBOL: /*  /^../m  */
4023    if (locinput == reginfo->strbeg ||
4024     (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
4025    {
4026     break;
4027    }
4028    sayNO;
4029
4030   case GPOS: /*  \G  */
4031    if (locinput == reginfo->ganch)
4032     break;
4033    sayNO;
4034
4035   case KEEPS: /*   \K  */
4036    /* update the startpoint */
4037    st->u.keeper.val = rex->offs[0].start;
4038    rex->offs[0].start = locinput - reginfo->strbeg;
4039    PUSH_STATE_GOTO(KEEPS_next, next, locinput);
4040    /* NOTREACHED */
4041    assert(0);
4042
4043   case KEEPS_next_fail:
4044    /* rollback the start point change */
4045    rex->offs[0].start = st->u.keeper.val;
4046    sayNO_SILENT;
4047    /* NOTREACHED */
4048    assert(0);
4049
4050   case MEOL: /* /..$/m  */
4051    if (!NEXTCHR_IS_EOS && nextchr != '\n')
4052     sayNO;
4053    break;
4054
4055   case SEOL: /* /..$/  */
4056    if (!NEXTCHR_IS_EOS && nextchr != '\n')
4057     sayNO;
4058    if (reginfo->strend - locinput > 1)
4059     sayNO;
4060    break;
4061
4062   case EOS: /*  \z  */
4063    if (!NEXTCHR_IS_EOS)
4064     sayNO;
4065    break;
4066
4067   case SANY: /*  /./s  */
4068    if (NEXTCHR_IS_EOS)
4069     sayNO;
4070    goto increment_locinput;
4071
4072   case CANY: /*  \C  */
4073    if (NEXTCHR_IS_EOS)
4074     sayNO;
4075    locinput++;
4076    break;
4077
4078   case REG_ANY: /*  /./  */
4079    if ((NEXTCHR_IS_EOS) || nextchr == '\n')
4080     sayNO;
4081    goto increment_locinput;
4082
4083
4084 #undef  ST
4085 #define ST st->u.trie
4086   case TRIEC: /* (ab|cd) with known charclass */
4087    /* In this case the charclass data is available inline so
4088    we can fail fast without a lot of extra overhead.
4089    */
4090    if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
4091     DEBUG_EXECUTE_r(
4092      PerlIO_printf(Perl_debug_log,
4093        "%*s  %sfailed to match trie start class...%s\n",
4094        REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4095     );
4096     sayNO_SILENT;
4097     /* NOTREACHED */
4098     assert(0);
4099    }
4100    /* FALLTHROUGH */
4101   case TRIE:  /* (ab|cd)  */
4102    /* the basic plan of execution of the trie is:
4103    * At the beginning, run through all the states, and
4104    * find the longest-matching word. Also remember the position
4105    * of the shortest matching word. For example, this pattern:
4106    *    1  2 3 4    5
4107    *    ab|a|x|abcd|abc
4108    * when matched against the string "abcde", will generate
4109    * accept states for all words except 3, with the longest
4110    * matching word being 4, and the shortest being 2 (with
4111    * the position being after char 1 of the string).
4112    *
4113    * Then for each matching word, in word order (i.e. 1,2,4,5),
4114    * we run the remainder of the pattern; on each try setting
4115    * the current position to the character following the word,
4116    * returning to try the next word on failure.
4117    *
4118    * We avoid having to build a list of words at runtime by
4119    * using a compile-time structure, wordinfo[].prev, which
4120    * gives, for each word, the previous accepting word (if any).
4121    * In the case above it would contain the mappings 1->2, 2->0,
4122    * 3->0, 4->5, 5->1.  We can use this table to generate, from
4123    * the longest word (4 above), a list of all words, by
4124    * following the list of prev pointers; this gives us the
4125    * unordered list 4,5,1,2. Then given the current word we have
4126    * just tried, we can go through the list and find the
4127    * next-biggest word to try (so if we just failed on word 2,
4128    * the next in the list is 4).
4129    *
4130    * Since at runtime we don't record the matching position in
4131    * the string for each word, we have to work that out for
4132    * each word we're about to process. The wordinfo table holds
4133    * the character length of each word; given that we recorded
4134    * at the start: the position of the shortest word and its
4135    * length in chars, we just need to move the pointer the
4136    * difference between the two char lengths. Depending on
4137    * Unicode status and folding, that's cheap or expensive.
4138    *
4139    * This algorithm is optimised for the case where there are only a
4140    * small number of accept states, i.e. 0,1, or maybe 2.
4141    * With lots of accept states, and having to try all of them,
4142    * it becomes quadratic on number of accept states to find all
4143    * the next words.
4144    */
4145
4146    {
4147     /* what type of TRIE am I? (utf8 makes this contextual) */
4148     DECL_TRIE_TYPE(scan);
4149
4150     /* what trie are we using right now */
4151     reg_trie_data * const trie
4152      = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
4153     HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
4154     U32 state = trie->startstate;
4155
4156     if (   trie->bitmap
4157      && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
4158     {
4159      if (trie->states[ state ].wordnum) {
4160       DEBUG_EXECUTE_r(
4161        PerlIO_printf(Perl_debug_log,
4162           "%*s  %smatched empty string...%s\n",
4163           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4164       );
4165       if (!trie->jump)
4166        break;
4167      } else {
4168       DEBUG_EXECUTE_r(
4169        PerlIO_printf(Perl_debug_log,
4170           "%*s  %sfailed to match trie start class...%s\n",
4171           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4172       );
4173       sayNO_SILENT;
4174     }
4175     }
4176
4177    {
4178     U8 *uc = ( U8* )locinput;
4179
4180     STRLEN len = 0;
4181     STRLEN foldlen = 0;
4182     U8 *uscan = (U8*)NULL;
4183     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
4184     U32 charcount = 0; /* how many input chars we have matched */
4185     U32 accepted = 0; /* have we seen any accepting states? */
4186
4187     ST.jump = trie->jump;
4188     ST.me = scan;
4189     ST.firstpos = NULL;
4190     ST.longfold = FALSE; /* char longer if folded => it's harder */
4191     ST.nextword = 0;
4192
4193     /* fully traverse the TRIE; note the position of the
4194     shortest accept state and the wordnum of the longest
4195     accept state */
4196
4197     while ( state && uc <= (U8*)(reginfo->strend) ) {
4198      U32 base = trie->states[ state ].trans.base;
4199      UV uvc = 0;
4200      U16 charid = 0;
4201      U16 wordnum;
4202      wordnum = trie->states[ state ].wordnum;
4203
4204      if (wordnum) { /* it's an accept state */
4205       if (!accepted) {
4206        accepted = 1;
4207        /* record first match position */
4208        if (ST.longfold) {
4209         ST.firstpos = (U8*)locinput;
4210         ST.firstchars = 0;
4211        }
4212        else {
4213         ST.firstpos = uc;
4214         ST.firstchars = charcount;
4215        }
4216       }
4217       if (!ST.nextword || wordnum < ST.nextword)
4218        ST.nextword = wordnum;
4219       ST.topword = wordnum;
4220      }
4221
4222      DEBUG_TRIE_EXECUTE_r({
4223         DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
4224         PerlIO_printf( Perl_debug_log,
4225          "%*s  %sState: %4"UVxf" Accepted: %c ",
4226          2+depth * 2, "", PL_colors[4],
4227          (UV)state, (accepted ? 'Y' : 'N'));
4228      });
4229
4230      /* read a char and goto next state */
4231      if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
4232       I32 offset;
4233       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
4234            uscan, len, uvc, charid, foldlen,
4235            foldbuf, uniflags);
4236       charcount++;
4237       if (foldlen>0)
4238        ST.longfold = TRUE;
4239       if (charid &&
4240        ( ((offset =
4241        base + charid - 1 - trie->uniquecharcount)) >= 0)
4242
4243        && ((U32)offset < trie->lasttrans)
4244        && trie->trans[offset].check == state)
4245       {
4246        state = trie->trans[offset].next;
4247       }
4248       else {
4249        state = 0;
4250       }
4251       uc += len;
4252
4253      }
4254      else {
4255       state = 0;
4256      }
4257      DEBUG_TRIE_EXECUTE_r(
4258       PerlIO_printf( Perl_debug_log,
4259        "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
4260        charid, uvc, (UV)state, PL_colors[5] );
4261      );
4262     }
4263     if (!accepted)
4264     sayNO;
4265
4266     /* calculate total number of accept states */
4267     {
4268      U16 w = ST.topword;
4269      accepted = 0;
4270      while (w) {
4271       w = trie->wordinfo[w].prev;
4272       accepted++;
4273      }
4274      ST.accepted = accepted;
4275     }
4276
4277     DEBUG_EXECUTE_r(
4278      PerlIO_printf( Perl_debug_log,
4279       "%*s  %sgot %"IVdf" possible matches%s\n",
4280       REPORT_CODE_OFF + depth * 2, "",
4281       PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4282     );
4283     goto trie_first_try; /* jump into the fail handler */
4284    }}
4285    /* NOTREACHED */
4286    assert(0);
4287
4288   case TRIE_next_fail: /* we failed - try next alternative */
4289   {
4290    U8 *uc;
4291    if ( ST.jump) {
4292     REGCP_UNWIND(ST.cp);
4293     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
4294    }
4295    if (!--ST.accepted) {
4296     DEBUG_EXECUTE_r({
4297      PerlIO_printf( Perl_debug_log,
4298       "%*s  %sTRIE failed...%s\n",
4299       REPORT_CODE_OFF+depth*2, "",
4300       PL_colors[4],
4301       PL_colors[5] );
4302     });
4303     sayNO_SILENT;
4304    }
4305    {
4306     /* Find next-highest word to process.  Note that this code
4307     * is O(N^2) per trie run (O(N) per branch), so keep tight */
4308     U16 min = 0;
4309     U16 word;
4310     U16 const nextword = ST.nextword;
4311     reg_trie_wordinfo * const wordinfo
4312      = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4313     for (word=ST.topword; word; word=wordinfo[word].prev) {
4314      if (word > nextword && (!min || word < min))
4315       min = word;
4316     }
4317     ST.nextword = min;
4318    }
4319
4320   trie_first_try:
4321    if (do_cutgroup) {
4322     do_cutgroup = 0;
4323     no_final = 0;
4324    }
4325
4326    if ( ST.jump) {
4327     ST.lastparen = rex->lastparen;
4328     ST.lastcloseparen = rex->lastcloseparen;
4329     REGCP_SET(ST.cp);
4330    }
4331
4332    /* find start char of end of current word */
4333    {
4334     U32 chars; /* how many chars to skip */
4335     reg_trie_data * const trie
4336      = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4337
4338     assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4339        >=  ST.firstchars);
4340     chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4341        - ST.firstchars;
4342     uc = ST.firstpos;
4343
4344     if (ST.longfold) {
4345      /* the hard option - fold each char in turn and find
4346      * its folded length (which may be different) */
4347      U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4348      STRLEN foldlen;
4349      STRLEN len;
4350      UV uvc;
4351      U8 *uscan;
4352
4353      while (chars) {
4354       if (utf8_target) {
4355        uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
4356              uniflags);
4357        uc += len;
4358       }
4359       else {
4360        uvc = *uc;
4361        uc++;
4362       }
4363       uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4364       uscan = foldbuf;
4365       while (foldlen) {
4366        if (!--chars)
4367         break;
4368        uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
4369            uniflags);
4370        uscan += len;
4371        foldlen -= len;
4372       }
4373      }
4374     }
4375     else {
4376      if (utf8_target)
4377       while (chars--)
4378        uc += UTF8SKIP(uc);
4379      else
4380       uc += chars;
4381     }
4382    }
4383
4384    scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4385        ? ST.jump[ST.nextword]
4386        : NEXT_OFF(ST.me));
4387
4388    DEBUG_EXECUTE_r({
4389     PerlIO_printf( Perl_debug_log,
4390      "%*s  %sTRIE matched word #%d, continuing%s\n",
4391      REPORT_CODE_OFF+depth*2, "",
4392      PL_colors[4],
4393      ST.nextword,
4394      PL_colors[5]
4395      );
4396    });
4397
4398    if (ST.accepted > 1 || has_cutgroup) {
4399     PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4400     /* NOTREACHED */
4401     assert(0);
4402    }
4403    /* only one choice left - just continue */
4404    DEBUG_EXECUTE_r({
4405     AV *const trie_words
4406      = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4407     SV ** const tmp = av_fetch( trie_words,
4408      ST.nextword-1, 0 );
4409     SV *sv= tmp ? sv_newmortal() : NULL;
4410
4411     PerlIO_printf( Perl_debug_log,
4412      "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4413      REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4414      ST.nextword,
4415      tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4416        PL_colors[0], PL_colors[1],
4417        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4418       )
4419      : "not compiled under -Dr",
4420      PL_colors[5] );
4421    });
4422
4423    locinput = (char*)uc;
4424    continue; /* execute rest of RE */
4425    /* NOTREACHED */
4426    assert(0);
4427   }
4428 #undef  ST
4429
4430   case EXACT: {            /*  /abc/        */
4431    char *s = STRING(scan);
4432    ln = STR_LEN(scan);
4433    if (utf8_target != is_utf8_pat) {
4434     /* The target and the pattern have differing utf8ness. */
4435     char *l = locinput;
4436     const char * const e = s + ln;
4437
4438     if (utf8_target) {
4439      /* The target is utf8, the pattern is not utf8.
4440      * Above-Latin1 code points can't match the pattern;
4441      * invariants match exactly, and the other Latin1 ones need
4442      * to be downgraded to a single byte in order to do the
4443      * comparison.  (If we could be confident that the target
4444      * is not malformed, this could be refactored to have fewer
4445      * tests by just assuming that if the first bytes match, it
4446      * is an invariant, but there are tests in the test suite
4447      * dealing with (??{...}) which violate this) */
4448      while (s < e) {
4449       if (l >= reginfo->strend
4450        || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4451       {
4452        sayNO;
4453       }
4454       if (UTF8_IS_INVARIANT(*(U8*)l)) {
4455        if (*l != *s) {
4456         sayNO;
4457        }
4458        l++;
4459       }
4460       else {
4461        if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4462        {
4463         sayNO;
4464        }
4465        l += 2;
4466       }
4467       s++;
4468      }
4469     }
4470     else {
4471      /* The target is not utf8, the pattern is utf8. */
4472      while (s < e) {
4473       if (l >= reginfo->strend
4474        || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4475       {
4476        sayNO;
4477       }
4478       if (UTF8_IS_INVARIANT(*(U8*)s)) {
4479        if (*s != *l) {
4480         sayNO;
4481        }
4482        s++;
4483       }
4484       else {
4485        if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4486        {
4487         sayNO;
4488        }
4489        s += 2;
4490       }
4491       l++;
4492      }
4493     }
4494     locinput = l;
4495    }
4496    else {
4497     /* The target and the pattern have the same utf8ness. */
4498     /* Inline the first character, for speed. */
4499     if (reginfo->strend - locinput < ln
4500      || UCHARAT(s) != nextchr
4501      || (ln > 1 && memNE(s, locinput, ln)))
4502     {
4503      sayNO;
4504     }
4505     locinput += ln;
4506    }
4507    break;
4508    }
4509
4510   case EXACTFL: {          /*  /abc/il      */
4511    re_fold_t folder;
4512    const U8 * fold_array;
4513    const char * s;
4514    U32 fold_utf8_flags;
4515
4516    folder = foldEQ_locale;
4517    fold_array = PL_fold_locale;
4518    fold_utf8_flags = FOLDEQ_LOCALE;
4519    goto do_exactf;
4520
4521   case EXACTFU_SS:         /*  /\x{df}/iu   */
4522   case EXACTFU:            /*  /abc/iu      */
4523    folder = foldEQ_latin1;
4524    fold_array = PL_fold_latin1;
4525    fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4526    goto do_exactf;
4527
4528   case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
4529         patterns */
4530    assert(! is_utf8_pat);
4531    /* FALLTHROUGH */
4532   case EXACTFA:            /*  /abc/iaa     */
4533    folder = foldEQ_latin1;
4534    fold_array = PL_fold_latin1;
4535    fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4536    goto do_exactf;
4537
4538   case EXACTF:             /*  /abc/i    This node only generated for
4539            non-utf8 patterns */
4540    assert(! is_utf8_pat);
4541    folder = foldEQ;
4542    fold_array = PL_fold;
4543    fold_utf8_flags = 0;
4544
4545   do_exactf:
4546    s = STRING(scan);
4547    ln = STR_LEN(scan);
4548
4549    if (utf8_target
4550     || is_utf8_pat
4551     || state_num == EXACTFU_SS
4552     || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4553    {
4554    /* Either the target or the pattern is utf8, or the node has the issue
4555    * where the fold lengths may differ. */
4556     const char * const l = locinput;
4557     char *e = reginfo->strend;
4558
4559     if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4560           l, &e, 0,  utf8_target, fold_utf8_flags))
4561     {
4562      sayNO;
4563     }
4564     locinput = e;
4565     break;
4566    }
4567
4568    /* Neither the target nor the pattern are utf8 */
4569    if (UCHARAT(s) != nextchr
4570     && !NEXTCHR_IS_EOS
4571     && UCHARAT(s) != fold_array[nextchr])
4572    {
4573     sayNO;
4574    }
4575    if (reginfo->strend - locinput < ln)
4576     sayNO;
4577    if (ln > 1 && ! folder(s, locinput, ln))
4578     sayNO;
4579    locinput += ln;
4580    break;
4581   }
4582
4583   /* XXX Could improve efficiency by separating these all out using a
4584   * macro or in-line function.  At that point regcomp.c would no longer
4585   * have to set the FLAGS fields of these */
4586   case BOUNDL:  /*  /\b/l  */
4587   case NBOUNDL: /*  /\B/l  */
4588   case BOUND:   /*  /\b/   */
4589   case BOUNDU:  /*  /\b/u  */
4590   case BOUNDA:  /*  /\b/a  */
4591   case NBOUND:  /*  /\B/   */
4592   case NBOUNDU: /*  /\B/u  */
4593   case NBOUNDA: /*  /\B/a  */
4594    /* was last char in word? */
4595    if (utf8_target
4596     && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4597     && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4598    {
4599     if (locinput == reginfo->strbeg)
4600      ln = '\n';
4601     else {
4602      const U8 * const r =
4603        reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4604
4605      ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
4606                 0, uniflags);
4607     }
4608     if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4609      ln = isWORDCHAR_uni(ln);
4610      if (NEXTCHR_IS_EOS)
4611       n = 0;
4612      else {
4613       LOAD_UTF8_CHARCLASS_ALNUM();
4614       n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4615                 utf8_target);
4616      }
4617     }
4618     else {
4619      ln = isWORDCHAR_LC_uvchr(ln);
4620      n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4621     }
4622    }
4623    else {
4624
4625     /* Here the string isn't utf8, or is utf8 and only ascii
4626     * characters are to match \w.  In the latter case looking at
4627     * the byte just prior to the current one may be just the final
4628     * byte of a multi-byte character.  This is ok.  There are two
4629     * cases:
4630     * 1) it is a single byte character, and then the test is doing
4631     * just what it's supposed to.
4632     * 2) it is a multi-byte character, in which case the final
4633     * byte is never mistakable for ASCII, and so the test
4634     * will say it is not a word character, which is the
4635     * correct answer. */
4636     ln = (locinput != reginfo->strbeg) ?
4637      UCHARAT(locinput - 1) : '\n';
4638     switch (FLAGS(scan)) {
4639      case REGEX_UNICODE_CHARSET:
4640       ln = isWORDCHAR_L1(ln);
4641       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4642       break;
4643      case REGEX_LOCALE_CHARSET:
4644       ln = isWORDCHAR_LC(ln);
4645       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4646       break;
4647      case REGEX_DEPENDS_CHARSET:
4648       ln = isWORDCHAR(ln);
4649       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4650       break;
4651      case REGEX_ASCII_RESTRICTED_CHARSET:
4652      case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4653       ln = isWORDCHAR_A(ln);
4654       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4655       break;
4656      default:
4657       Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4658     }
4659    }
4660    /* Note requires that all BOUNDs be lower than all NBOUNDs in
4661    * regcomp.sym */
4662    if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4663      sayNO;
4664    break;
4665
4666   case ANYOF:  /*  /[abc]/       */
4667    if (NEXTCHR_IS_EOS)
4668     sayNO;
4669    if (utf8_target) {
4670     if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
4671                 utf8_target))
4672      sayNO;
4673     locinput += UTF8SKIP(locinput);
4674    }
4675    else {
4676     if (!REGINCLASS(rex, scan, (U8*)locinput))
4677      sayNO;
4678     locinput++;
4679    }
4680    break;
4681
4682   /* The argument (FLAGS) to all the POSIX node types is the class number
4683   * */
4684
4685   case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4686    to_complement = 1;
4687    /* FALLTHROUGH */
4688
4689   case POSIXL:    /* \w or [:punct:] etc. under /l */
4690    if (NEXTCHR_IS_EOS)
4691     sayNO;
4692
4693    /* Use isFOO_lc() for characters within Latin1.  (Note that
4694    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4695    * wouldn't be invariant) */
4696    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4697     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4698      sayNO;
4699     }
4700    }
4701    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4702     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4703           (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4704                *(locinput + 1))))))
4705     {
4706      sayNO;
4707     }
4708    }
4709    else { /* Here, must be an above Latin-1 code point */
4710     goto utf8_posix_not_eos;
4711    }
4712
4713    /* Here, must be utf8 */
4714    locinput += UTF8SKIP(locinput);
4715    break;
4716
4717   case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4718    to_complement = 1;
4719    /* FALLTHROUGH */
4720
4721   case POSIXD:    /* \w or [:punct:] etc. under /d */
4722    if (utf8_target) {
4723     goto utf8_posix;
4724    }
4725    goto posixa;
4726
4727   case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4728
4729    if (NEXTCHR_IS_EOS) {
4730     sayNO;
4731    }
4732
4733    /* All UTF-8 variants match */
4734    if (! UTF8_IS_INVARIANT(nextchr)) {
4735     goto increment_locinput;
4736    }
4737
4738    to_complement = 1;
4739    /* FALLTHROUGH */
4740
4741   case POSIXA:    /* \w or [:punct:] etc. under /a */
4742
4743   posixa:
4744    /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4745    * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4746    * character is a single byte */
4747
4748    if (NEXTCHR_IS_EOS
4749     || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4750                FLAGS(scan)))))
4751    {
4752     sayNO;
4753    }
4754
4755    /* Here we are either not in utf8, or we matched a utf8-invariant,
4756    * so the next char is the next byte */
4757    locinput++;
4758    break;
4759
4760   case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4761    to_complement = 1;
4762    /* FALLTHROUGH */
4763
4764   case POSIXU:    /* \w or [:punct:] etc. under /u */
4765   utf8_posix:
4766    if (NEXTCHR_IS_EOS) {
4767     sayNO;
4768    }
4769   utf8_posix_not_eos:
4770
4771    /* Use _generic_isCC() for characters within Latin1.  (Note that
4772    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4773    * wouldn't be invariant) */
4774    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4775     if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4776               FLAGS(scan)))))
4777     {
4778      sayNO;
4779     }
4780     locinput++;
4781    }
4782    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4783     if (! (to_complement
4784      ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4785                *(locinput + 1)),
4786            FLAGS(scan)))))
4787     {
4788      sayNO;
4789     }
4790     locinput += 2;
4791    }
4792    else {  /* Handle above Latin-1 code points */
4793     classnum = (_char_class_number) FLAGS(scan);
4794     if (classnum < _FIRST_NON_SWASH_CC) {
4795
4796      /* Here, uses a swash to find such code points.  Load it if
4797      * not done already */
4798      if (! PL_utf8_swash_ptrs[classnum]) {
4799       U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4800       PL_utf8_swash_ptrs[classnum]
4801         = _core_swash_init("utf8",
4802           "",
4803           &PL_sv_undef, 1, 0,
4804           PL_XPosix_ptrs[classnum], &flags);
4805      }
4806      if (! (to_complement
4807       ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4808            (U8 *) locinput, TRUE))))
4809      {
4810       sayNO;
4811      }
4812     }
4813     else {  /* Here, uses macros to find above Latin-1 code points */
4814      switch (classnum) {
4815       case _CC_ENUM_SPACE:    /* XXX would require separate
4816             code if we revert the change
4817             of \v matching this */
4818       case _CC_ENUM_PSXSPC:
4819        if (! (to_complement
4820           ^ cBOOL(is_XPERLSPACE_high(locinput))))
4821        {
4822         sayNO;
4823        }
4824        break;
4825       case _CC_ENUM_BLANK:
4826        if (! (to_complement
4827            ^ cBOOL(is_HORIZWS_high(locinput))))
4828        {
4829         sayNO;
4830        }
4831        break;
4832       case _CC_ENUM_XDIGIT:
4833        if (! (to_complement
4834            ^ cBOOL(is_XDIGIT_high(locinput))))
4835        {
4836         sayNO;
4837        }
4838        break;
4839       case _CC_ENUM_VERTSPACE:
4840        if (! (to_complement
4841            ^ cBOOL(is_VERTWS_high(locinput))))
4842        {
4843         sayNO;
4844        }
4845        break;
4846       default:    /* The rest, e.g. [:cntrl:], can't match
4847          above Latin1 */
4848        if (! to_complement) {
4849         sayNO;
4850        }
4851        break;
4852      }
4853     }
4854     locinput += UTF8SKIP(locinput);
4855    }
4856    break;
4857
4858   case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4859      a Unicode extended Grapheme Cluster */
4860    /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4861    extended Grapheme Cluster is:
4862
4863    CR LF
4864    | Prepend* Begin Extend*
4865    | .
4866
4867    Begin is:           ( Special_Begin | ! Control )
4868    Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4869    Extend is:          ( Grapheme_Extend | Spacing_Mark )
4870    Control is:         [ GCB_Control | CR | LF ]
4871    Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4872
4873    If we create a 'Regular_Begin' = Begin - Special_Begin, then
4874    we can rewrite
4875
4876     Begin is ( Regular_Begin + Special Begin )
4877
4878    It turns out that 98.4% of all Unicode code points match
4879    Regular_Begin.  Doing it this way eliminates a table match in
4880    the previous implementation for almost all Unicode code points.
4881
4882    There is a subtlety with Prepend* which showed up in testing.
4883    Note that the Begin, and only the Begin is required in:
4884     | Prepend* Begin Extend*
4885    Also, Begin contains '! Control'.  A Prepend must be a
4886    '!  Control', which means it must also be a Begin.  What it
4887    comes down to is that if we match Prepend* and then find no
4888    suitable Begin afterwards, that if we backtrack the last
4889    Prepend, that one will be a suitable Begin.
4890    */
4891
4892    if (NEXTCHR_IS_EOS)
4893     sayNO;
4894    if  (! utf8_target) {
4895
4896     /* Match either CR LF  or '.', as all the other possibilities
4897     * require utf8 */
4898     locinput++;     /* Match the . or CR */
4899     if (nextchr == '\r' /* And if it was CR, and the next is LF,
4900          match the LF */
4901      && locinput < reginfo->strend
4902      && UCHARAT(locinput) == '\n')
4903     {
4904      locinput++;
4905     }
4906    }
4907    else {
4908
4909     /* Utf8: See if is ( CR LF ); already know that locinput <
4910     * reginfo->strend, so locinput+1 is in bounds */
4911     if ( nextchr == '\r' && locinput+1 < reginfo->strend
4912      && UCHARAT(locinput + 1) == '\n')
4913     {
4914      locinput += 2;
4915     }
4916     else {
4917      STRLEN len;
4918
4919      /* In case have to backtrack to beginning, then match '.' */
4920      char *starting = locinput;
4921
4922      /* In case have to backtrack the last prepend */
4923      char *previous_prepend = NULL;
4924
4925      LOAD_UTF8_CHARCLASS_GCB();
4926
4927      /* Match (prepend)*   */
4928      while (locinput < reginfo->strend
4929       && (len = is_GCB_Prepend_utf8(locinput)))
4930      {
4931       previous_prepend = locinput;
4932       locinput += len;
4933      }
4934
4935      /* As noted above, if we matched a prepend character, but
4936      * the next thing won't match, back off the last prepend we
4937      * matched, as it is guaranteed to match the begin */
4938      if (previous_prepend
4939       && (locinput >=  reginfo->strend
4940        || (! swash_fetch(PL_utf8_X_regular_begin,
4941            (U8*)locinput, utf8_target)
4942         && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4943       )
4944      {
4945       locinput = previous_prepend;
4946      }
4947
4948      /* Note that here we know reginfo->strend > locinput, as we
4949      * tested that upon input to this switch case, and if we
4950      * moved locinput forward, we tested the result just above
4951      * and it either passed, or we backed off so that it will
4952      * now pass */
4953      if (swash_fetch(PL_utf8_X_regular_begin,
4954          (U8*)locinput, utf8_target)) {
4955       locinput += UTF8SKIP(locinput);
4956      }
4957      else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4958
4959       /* Here did not match the required 'Begin' in the
4960       * second term.  So just match the very first
4961       * character, the '.' of the final term of the regex */
4962       locinput = starting + UTF8SKIP(starting);
4963       goto exit_utf8;
4964      } else {
4965
4966       /* Here is a special begin.  It can be composed of
4967       * several individual characters.  One possibility is
4968       * RI+ */
4969       if ((len = is_GCB_RI_utf8(locinput))) {
4970        locinput += len;
4971        while (locinput < reginfo->strend
4972         && (len = is_GCB_RI_utf8(locinput)))
4973        {
4974         locinput += len;
4975        }
4976       } else if ((len = is_GCB_T_utf8(locinput))) {
4977        /* Another possibility is T+ */
4978        locinput += len;
4979        while (locinput < reginfo->strend
4980         && (len = is_GCB_T_utf8(locinput)))
4981        {
4982         locinput += len;
4983        }
4984       } else {
4985
4986        /* Here, neither RI+ nor T+; must be some other
4987        * Hangul.  That means it is one of the others: L,
4988        * LV, LVT or V, and matches:
4989        * L* (L | LVT T* | V  V* T* | LV  V* T*) */
4990
4991        /* Match L*           */
4992        while (locinput < reginfo->strend
4993         && (len = is_GCB_L_utf8(locinput)))
4994        {
4995         locinput += len;
4996        }
4997
4998        /* Here, have exhausted L*.  If the next character
4999        * is not an LV, LVT nor V, it means we had to have
5000        * at least one L, so matches L+ in the original
5001        * equation, we have a complete hangul syllable.
5002        * Are done. */
5003
5004        if (locinput < reginfo->strend
5005         && is_GCB_LV_LVT_V_utf8(locinput))
5006        {
5007         /* Otherwise keep going.  Must be LV, LVT or V.
5008         * See if LVT, by first ruling out V, then LV */
5009         if (! is_GCB_V_utf8(locinput)
5010           /* Every TCount-th syllable is an LV; the rest are LVT */
5011          && (valid_utf8_to_uvchr((U8 *) locinput,
5012                   NULL)
5013                   - SBASE)
5014           % TCount != 0)
5015         {
5016          locinput += UTF8SKIP(locinput);
5017         } else {
5018
5019          /* Must be  V or LV.  Take it, then match
5020          * V*     */
5021          locinput += UTF8SKIP(locinput);
5022          while (locinput < reginfo->strend
5023           && (len = is_GCB_V_utf8(locinput)))
5024          {
5025           locinput += len;
5026          }
5027         }
5028
5029         /* And any of LV, LVT, or V can be followed
5030         * by T*            */
5031         while (locinput < reginfo->strend
5032          && (len = is_GCB_T_utf8(locinput)))
5033         {
5034          locinput += len;
5035         }
5036        }
5037       }
5038      }
5039
5040      /* Match any extender */
5041      while (locinput < reginfo->strend
5042        && swash_fetch(PL_utf8_X_extend,
5043            (U8*)locinput, utf8_target))
5044      {
5045       locinput += UTF8SKIP(locinput);
5046      }
5047     }
5048    exit_utf8:
5049     if (locinput > reginfo->strend) sayNO;
5050    }
5051    break;
5052
5053   case NREFFL:  /*  /\g{name}/il  */
5054   {   /* The capture buffer cases.  The ones beginning with N for the
5055    named buffers just convert to the equivalent numbered and
5056    pretend they were called as the corresponding numbered buffer
5057    op.  */
5058    /* don't initialize these in the declaration, it makes C++
5059    unhappy */
5060    const char *s;
5061    char type;
5062    re_fold_t folder;
5063    const U8 *fold_array;
5064    UV utf8_fold_flags;
5065
5066    folder = foldEQ_locale;
5067    fold_array = PL_fold_locale;
5068    type = REFFL;
5069    utf8_fold_flags = FOLDEQ_LOCALE;
5070    goto do_nref;
5071
5072   case NREFFA:  /*  /\g{name}/iaa  */
5073    folder = foldEQ_latin1;
5074    fold_array = PL_fold_latin1;
5075    type = REFFA;
5076    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5077    goto do_nref;
5078
5079   case NREFFU:  /*  /\g{name}/iu  */
5080    folder = foldEQ_latin1;
5081    fold_array = PL_fold_latin1;
5082    type = REFFU;
5083    utf8_fold_flags = 0;
5084    goto do_nref;
5085
5086   case NREFF:  /*  /\g{name}/i  */
5087    folder = foldEQ;
5088    fold_array = PL_fold;
5089    type = REFF;
5090    utf8_fold_flags = 0;
5091    goto do_nref;
5092
5093   case NREF:  /*  /\g{name}/   */
5094    type = REF;
5095    folder = NULL;
5096    fold_array = NULL;
5097    utf8_fold_flags = 0;
5098   do_nref:
5099
5100    /* For the named back references, find the corresponding buffer
5101    * number */
5102    n = reg_check_named_buff_matched(rex,scan);
5103
5104    if ( ! n ) {
5105     sayNO;
5106    }
5107    goto do_nref_ref_common;
5108
5109   case REFFL:  /*  /\1/il  */
5110    folder = foldEQ_locale;
5111    fold_array = PL_fold_locale;
5112    utf8_fold_flags = FOLDEQ_LOCALE;
5113    goto do_ref;
5114
5115   case REFFA:  /*  /\1/iaa  */
5116    folder = foldEQ_latin1;
5117    fold_array = PL_fold_latin1;
5118    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5119    goto do_ref;
5120
5121   case REFFU:  /*  /\1/iu  */
5122    folder = foldEQ_latin1;
5123    fold_array = PL_fold_latin1;
5124    utf8_fold_flags = 0;
5125    goto do_ref;
5126
5127   case REFF:  /*  /\1/i  */
5128    folder = foldEQ;
5129    fold_array = PL_fold;
5130    utf8_fold_flags = 0;
5131    goto do_ref;
5132
5133   case REF:  /*  /\1/    */
5134    folder = NULL;
5135    fold_array = NULL;
5136    utf8_fold_flags = 0;
5137
5138   do_ref:
5139    type = OP(scan);
5140    n = ARG(scan);  /* which paren pair */
5141
5142   do_nref_ref_common:
5143    ln = rex->offs[n].start;
5144    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5145    if (rex->lastparen < n || ln == -1)
5146     sayNO;   /* Do not match unless seen CLOSEn. */
5147    if (ln == rex->offs[n].end)
5148     break;
5149
5150    s = reginfo->strbeg + ln;
5151    if (type != REF /* REF can do byte comparison */
5152     && (utf8_target || type == REFFU || type == REFFL))
5153    {
5154     char * limit = reginfo->strend;
5155
5156     /* This call case insensitively compares the entire buffer
5157      * at s, with the current input starting at locinput, but
5158      * not going off the end given by reginfo->strend, and
5159      * returns in <limit> upon success, how much of the
5160      * current input was matched */
5161     if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
5162          locinput, &limit, 0, utf8_target, utf8_fold_flags))
5163     {
5164      sayNO;
5165     }
5166     locinput = limit;
5167     break;
5168    }
5169
5170    /* Not utf8:  Inline the first character, for speed. */
5171    if (!NEXTCHR_IS_EOS &&
5172     UCHARAT(s) != nextchr &&
5173     (type == REF ||
5174     UCHARAT(s) != fold_array[nextchr]))
5175     sayNO;
5176    ln = rex->offs[n].end - ln;
5177    if (locinput + ln > reginfo->strend)
5178     sayNO;
5179    if (ln > 1 && (type == REF
5180       ? memNE(s, locinput, ln)
5181       : ! folder(s, locinput, ln)))
5182     sayNO;
5183    locinput += ln;
5184    break;
5185   }
5186
5187   case NOTHING: /* null op; e.g. the 'nothing' following
5188      * the '*' in 'm{(a+|b)*}' */
5189    break;
5190   case TAIL: /* placeholder while compiling (A|B|C) */
5191    break;
5192
5193   case BACK: /* ??? doesn't appear to be used ??? */
5194    break;
5195
5196 #undef  ST
5197 #define ST st->u.eval
5198   {
5199    SV *ret;
5200    REGEXP *re_sv;
5201    regexp *re;
5202    regexp_internal *rei;
5203    regnode *startpoint;
5204
5205   case GOSTART: /*  (?R)  */
5206   case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
5207    if (cur_eval && cur_eval->locinput==locinput) {
5208     if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
5209      Perl_croak(aTHX_ "Infinite recursion in regex");
5210     if ( ++nochange_depth > max_nochange_depth )
5211      Perl_croak(aTHX_
5212       "Pattern subroutine nesting without pos change"
5213       " exceeded limit in regex");
5214    } else {
5215     nochange_depth = 0;
5216    }
5217    re_sv = rex_sv;
5218    re = rex;
5219    rei = rexi;
5220    if (OP(scan)==GOSUB) {
5221     startpoint = scan + ARG2L(scan);
5222     ST.close_paren = ARG(scan);
5223    } else {
5224     startpoint = rei->program+1;
5225     ST.close_paren = 0;
5226    }
5227
5228    /* Save all the positions seen so far. */
5229    ST.cp = regcppush(rex, 0, maxopenparen);
5230    REGCP_SET(ST.lastcp);
5231
5232    /* and then jump to the code we share with EVAL */
5233    goto eval_recurse_doit;
5234
5235    /* NOTREACHED */
5236    assert(0);
5237
5238   case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
5239    if (cur_eval && cur_eval->locinput==locinput) {
5240     if ( ++nochange_depth > max_nochange_depth )
5241      Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
5242    } else {
5243     nochange_depth = 0;
5244    }
5245    {
5246     /* execute the code in the {...} */
5247
5248     dSP;
5249     IV before;
5250     OP * const oop = PL_op;
5251     COP * const ocurcop = PL_curcop;
5252     OP *nop;
5253     CV *newcv;
5254
5255     /* save *all* paren positions */
5256     regcppush(rex, 0, maxopenparen);
5257     REGCP_SET(runops_cp);
5258
5259     if (!caller_cv)
5260      caller_cv = find_runcv(NULL);
5261
5262     n = ARG(scan);
5263
5264     if (rexi->data->what[n] == 'r') { /* code from an external qr */
5265      newcv = (ReANY(
5266             (REGEXP*)(rexi->data->data[n])
5267            ))->qr_anoncv
5268           ;
5269      nop = (OP*)rexi->data->data[n+1];
5270     }
5271     else if (rexi->data->what[n] == 'l') { /* literal code */
5272      newcv = caller_cv;
5273      nop = (OP*)rexi->data->data[n];
5274      assert(CvDEPTH(newcv));
5275     }
5276     else {
5277      /* literal with own CV */
5278      assert(rexi->data->what[n] == 'L');
5279      newcv = rex->qr_anoncv;
5280      nop = (OP*)rexi->data->data[n];
5281     }
5282
5283     /* normally if we're about to execute code from the same
5284     * CV that we used previously, we just use the existing
5285     * CX stack entry. However, it's possible that in the
5286     * meantime we may have backtracked, popped from the save
5287     * stack, and undone the SAVECOMPPAD(s) associated with
5288     * PUSH_MULTICALL; in which case PL_comppad no longer
5289     * points to newcv's pad. */
5290     if (newcv != last_pushed_cv || PL_comppad != last_pad)
5291     {
5292      U8 flags = (CXp_SUB_RE |
5293         ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
5294      if (last_pushed_cv) {
5295       CHANGE_MULTICALL_FLAGS(newcv, flags);
5296      }
5297      else {
5298       PUSH_MULTICALL_FLAGS(newcv, flags);
5299      }
5300      last_pushed_cv = newcv;
5301     }
5302     else {
5303      /* these assignments are just to silence compiler
5304      * warnings */
5305      multicall_cop = NULL;
5306      newsp = NULL;
5307     }
5308     last_pad = PL_comppad;
5309
5310     /* the initial nextstate you would normally execute
5311     * at the start of an eval (which would cause error
5312     * messages to come from the eval), may be optimised
5313     * away from the execution path in the regex code blocks;
5314     * so manually set PL_curcop to it initially */
5315     {
5316      OP *o = cUNOPx(nop)->op_first;
5317      assert(o->op_type == OP_NULL);
5318      if (o->op_targ == OP_SCOPE) {
5319       o = cUNOPo->op_first;
5320      }
5321      else {
5322       assert(o->op_targ == OP_LEAVE);
5323       o = cUNOPo->op_first;
5324       assert(o->op_type == OP_ENTER);
5325       o = OP_SIBLING(o);
5326      }
5327
5328      if (o->op_type != OP_STUB) {
5329       assert(    o->op_type == OP_NEXTSTATE
5330         || o->op_type == OP_DBSTATE
5331         || (o->op_type == OP_NULL
5332          &&  (  o->op_targ == OP_NEXTSTATE
5333           || o->op_targ == OP_DBSTATE
5334           )
5335          )
5336       );
5337       PL_curcop = (COP*)o;
5338      }
5339     }
5340     nop = nop->op_next;
5341
5342     DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
5343      "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5344
5345     rex->offs[0].end = locinput - reginfo->strbeg;
5346     if (reginfo->info_aux_eval->pos_magic)
5347      MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5348         reginfo->sv, reginfo->strbeg,
5349         locinput - reginfo->strbeg);
5350
5351     if (sv_yes_mark) {
5352      SV *sv_mrk = get_sv("REGMARK", 1);
5353      sv_setsv(sv_mrk, sv_yes_mark);
5354     }
5355
5356     /* we don't use MULTICALL here as we want to call the
5357     * first op of the block of interest, rather than the
5358     * first op of the sub */
5359     before = (IV)(SP-PL_stack_base);
5360     PL_op = nop;
5361     CALLRUNOPS(aTHX);   /* Scalar context. */
5362     SPAGAIN;
5363     if ((IV)(SP-PL_stack_base) == before)
5364      ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
5365     else {
5366      ret = POPs;
5367      PUTBACK;
5368     }
5369
5370     /* before restoring everything, evaluate the returned
5371     * value, so that 'uninit' warnings don't use the wrong
5372     * PL_op or pad. Also need to process any magic vars
5373     * (e.g. $1) *before* parentheses are restored */
5374
5375     PL_op = NULL;
5376
5377     re_sv = NULL;
5378     if (logical == 0)        /*   (?{})/   */
5379      sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5380     else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
5381      sw = cBOOL(SvTRUE(ret));
5382      logical = 0;
5383     }
5384     else {                   /*  /(??{})  */
5385      /*  if it's overloaded, let the regex compiler handle
5386      *  it; otherwise extract regex, or stringify  */
5387      if (SvGMAGICAL(ret))
5388       ret = sv_mortalcopy(ret);
5389      if (!SvAMAGIC(ret)) {
5390       SV *sv = ret;
5391       if (SvROK(sv))
5392        sv = SvRV(sv);
5393       if (SvTYPE(sv) == SVt_REGEXP)
5394        re_sv = (REGEXP*) sv;
5395       else if (SvSMAGICAL(ret)) {
5396        MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
5397        if (mg)
5398         re_sv = (REGEXP *) mg->mg_obj;
5399       }
5400
5401       /* force any undef warnings here */
5402       if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
5403        ret = sv_mortalcopy(ret);
5404        (void) SvPV_force_nolen(ret);
5405       }
5406      }
5407
5408     }
5409
5410     /* *** Note that at this point we don't restore
5411     * PL_comppad, (or pop the CxSUB) on the assumption it may
5412     * be used again soon. This is safe as long as nothing
5413     * in the regexp code uses the pad ! */
5414     PL_op = oop;
5415     PL_curcop = ocurcop;
5416     S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5417     PL_curpm = PL_reg_curpm;
5418
5419     if (logical != 2)
5420      break;
5421    }
5422
5423     /* only /(??{})/  from now on */
5424     logical = 0;
5425     {
5426      /* extract RE object from returned value; compiling if
5427      * necessary */
5428
5429      if (re_sv) {
5430       re_sv = reg_temp_copy(NULL, re_sv);
5431      }
5432      else {
5433       U32 pm_flags = 0;
5434
5435       if (SvUTF8(ret) && IN_BYTES) {
5436        /* In use 'bytes': make a copy of the octet
5437        * sequence, but without the flag on */
5438        STRLEN len;
5439        const char *const p = SvPV(ret, len);
5440        ret = newSVpvn_flags(p, len, SVs_TEMP);
5441       }
5442       if (rex->intflags & PREGf_USE_RE_EVAL)
5443        pm_flags |= PMf_USE_RE_EVAL;
5444
5445       /* if we got here, it should be an engine which
5446       * supports compiling code blocks and stuff */
5447       assert(rex->engine && rex->engine->op_comp);
5448       assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5449       re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5450          rex->engine, NULL, NULL,
5451          /* copy /msix etc to inner pattern */
5452          scan->flags,
5453          pm_flags);
5454
5455       if (!(SvFLAGS(ret)
5456        & (SVs_TEMP | SVs_GMG | SVf_ROK))
5457       && (!SvPADTMP(ret) || SvREADONLY(ret))) {
5458        /* This isn't a first class regexp. Instead, it's
5459        caching a regexp onto an existing, Perl visible
5460        scalar.  */
5461        sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5462       }
5463      }
5464      SAVEFREESV(re_sv);
5465      re = ReANY(re_sv);
5466     }
5467     RXp_MATCH_COPIED_off(re);
5468     re->subbeg = rex->subbeg;
5469     re->sublen = rex->sublen;
5470     re->suboffset = rex->suboffset;
5471     re->subcoffset = rex->subcoffset;
5472     re->lastparen = 0;
5473     re->lastcloseparen = 0;
5474     rei = RXi_GET(re);
5475     DEBUG_EXECUTE_r(
5476      debug_start_match(re_sv, utf8_target, locinput,
5477          reginfo->strend, "Matching embedded");
5478     );
5479     startpoint = rei->program + 1;
5480      ST.close_paren = 0; /* only used for GOSUB */
5481     /* Save all the seen positions so far. */
5482     ST.cp = regcppush(rex, 0, maxopenparen);
5483     REGCP_SET(ST.lastcp);
5484     /* and set maxopenparen to 0, since we are starting a "fresh" match */
5485     maxopenparen = 0;
5486     /* run the pattern returned from (??{...}) */
5487
5488   eval_recurse_doit: /* Share code with GOSUB below this line
5489        * At this point we expect the stack context to be
5490        * set up correctly */
5491
5492     /* invalidate the S-L poscache. We're now executing a
5493     * different set of WHILEM ops (and their associated
5494     * indexes) against the same string, so the bits in the
5495     * cache are meaningless. Setting maxiter to zero forces
5496     * the cache to be invalidated and zeroed before reuse.
5497     * XXX This is too dramatic a measure. Ideally we should
5498     * save the old cache and restore when running the outer
5499     * pattern again */
5500     reginfo->poscache_maxiter = 0;
5501
5502     /* the new regexp might have a different is_utf8_pat than we do */
5503     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5504
5505     ST.prev_rex = rex_sv;
5506     ST.prev_curlyx = cur_curlyx;
5507     rex_sv = re_sv;
5508     SET_reg_curpm(rex_sv);
5509     rex = re;
5510     rexi = rei;
5511     cur_curlyx = NULL;
5512     ST.B = next;
5513     ST.prev_eval = cur_eval;
5514     cur_eval = st;
5515     /* now continue from first node in postponed RE */
5516     PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5517     /* NOTREACHED */
5518     assert(0);
5519   }
5520
5521   case EVAL_AB: /* cleanup after a successful (??{A})B */
5522    /* note: this is called twice; first after popping B, then A */
5523    rex_sv = ST.prev_rex;
5524    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5525    SET_reg_curpm(rex_sv);
5526    rex = ReANY(rex_sv);
5527    rexi = RXi_GET(rex);
5528    {
5529     /* preserve $^R across LEAVE's. See Bug 121070. */
5530     SV *save_sv= GvSV(PL_replgv);
5531     SvREFCNT_inc(save_sv);
5532     regcpblow(ST.cp); /* LEAVE in disguise */
5533     sv_setsv(GvSV(PL_replgv), save_sv);
5534     SvREFCNT_dec(save_sv);
5535    }
5536    cur_eval = ST.prev_eval;
5537    cur_curlyx = ST.prev_curlyx;
5538
5539    /* Invalidate cache. See "invalidate" comment above. */
5540    reginfo->poscache_maxiter = 0;
5541    if ( nochange_depth )
5542     nochange_depth--;
5543    sayYES;
5544
5545
5546   case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5547    /* note: this is called twice; first after popping B, then A */
5548    rex_sv = ST.prev_rex;
5549    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5550    SET_reg_curpm(rex_sv);
5551    rex = ReANY(rex_sv);
5552    rexi = RXi_GET(rex);
5553
5554    REGCP_UNWIND(ST.lastcp);
5555    regcppop(rex, &maxopenparen);
5556    cur_eval = ST.prev_eval;
5557    cur_curlyx = ST.prev_curlyx;
5558    /* Invalidate cache. See "invalidate" comment above. */
5559    reginfo->poscache_maxiter = 0;
5560    if ( nochange_depth )
5561     nochange_depth--;
5562    sayNO_SILENT;
5563 #undef ST
5564
5565   case OPEN: /*  (  */
5566    n = ARG(scan);  /* which paren pair */
5567    rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5568    if (n > maxopenparen)
5569     maxopenparen = n;
5570    DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5571     "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5572     PTR2UV(rex),
5573     PTR2UV(rex->offs),
5574     (UV)n,
5575     (IV)rex->offs[n].start_tmp,
5576     (UV)maxopenparen
5577    ));
5578    lastopen = n;
5579    break;
5580
5581 /* XXX really need to log other places start/end are set too */
5582 #define CLOSE_CAPTURE \
5583  rex->offs[n].start = rex->offs[n].start_tmp; \
5584  rex->offs[n].end = locinput - reginfo->strbeg; \
5585  DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5586   "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5587   PTR2UV(rex), \
5588   PTR2UV(rex->offs), \
5589   (UV)n, \
5590   (IV)rex->offs[n].start, \
5591   (IV)rex->offs[n].end \
5592  ))
5593
5594   case CLOSE:  /*  )  */
5595    n = ARG(scan);  /* which paren pair */
5596    CLOSE_CAPTURE;
5597    if (n > rex->lastparen)
5598     rex->lastparen = n;
5599    rex->lastcloseparen = n;
5600    if (cur_eval && cur_eval->u.eval.close_paren == n) {
5601     goto fake_end;
5602    }
5603    break;
5604
5605   case ACCEPT:  /*  (*ACCEPT)  */
5606    if (ARG(scan)){
5607     regnode *cursor;
5608     for (cursor=scan;
5609      cursor && OP(cursor)!=END;
5610      cursor=regnext(cursor))
5611     {
5612      if ( OP(cursor)==CLOSE ){
5613       n = ARG(cursor);
5614       if ( n <= lastopen ) {
5615        CLOSE_CAPTURE;
5616        if (n > rex->lastparen)
5617         rex->lastparen = n;
5618        rex->lastcloseparen = n;
5619        if ( n == ARG(scan) || (cur_eval &&
5620         cur_eval->u.eval.close_paren == n))
5621         break;
5622       }
5623      }
5624     }
5625    }
5626    goto fake_end;
5627    /* NOTREACHED */
5628
5629   case GROUPP:  /*  (?(1))  */
5630    n = ARG(scan);  /* which paren pair */
5631    sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5632    break;
5633
5634   case NGROUPP:  /*  (?(<name>))  */
5635    /* reg_check_named_buff_matched returns 0 for no match */
5636    sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5637    break;
5638
5639   case INSUBP:   /*  (?(R))  */
5640    n = ARG(scan);
5641    sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5642    break;
5643
5644   case DEFINEP:  /*  (?(DEFINE))  */
5645    sw = 0;
5646    break;
5647
5648   case IFTHEN:   /*  (?(cond)A|B)  */
5649    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5650    if (sw)
5651     next = NEXTOPER(NEXTOPER(scan));
5652    else {
5653     next = scan + ARG(scan);
5654     if (OP(next) == IFTHEN) /* Fake one. */
5655      next = NEXTOPER(NEXTOPER(next));
5656    }
5657    break;
5658
5659   case LOGICAL:  /* modifier for EVAL and IFMATCH */
5660    logical = scan->flags;
5661    break;
5662
5663 /*******************************************************************
5664
5665 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5666 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5667 STAR/PLUS/CURLY/CURLYN are used instead.)
5668
5669 A*B is compiled as <CURLYX><A><WHILEM><B>
5670
5671 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5672 state, which contains the current count, initialised to -1. It also sets
5673 cur_curlyx to point to this state, with any previous value saved in the
5674 state block.
5675
5676 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5677 since the pattern may possibly match zero times (i.e. it's a while {} loop
5678 rather than a do {} while loop).
5679
5680 Each entry to WHILEM represents a successful match of A. The count in the
5681 CURLYX block is incremented, another WHILEM state is pushed, and execution
5682 passes to A or B depending on greediness and the current count.
5683
5684 For example, if matching against the string a1a2a3b (where the aN are
5685 substrings that match /A/), then the match progresses as follows: (the
5686 pushed states are interspersed with the bits of strings matched so far):
5687
5688  <CURLYX cnt=-1>
5689  <CURLYX cnt=0><WHILEM>
5690  <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5691  <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5692  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5693  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5694
5695 (Contrast this with something like CURLYM, which maintains only a single
5696 backtrack state:
5697
5698  <CURLYM cnt=0> a1
5699  a1 <CURLYM cnt=1> a2
5700  a1 a2 <CURLYM cnt=2> a3
5701  a1 a2 a3 <CURLYM cnt=3> b
5702 )
5703
5704 Each WHILEM state block marks a point to backtrack to upon partial failure
5705 of A or B, and also contains some minor state data related to that
5706 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5707 overall state, such as the count, and pointers to the A and B ops.
5708
5709 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5710 must always point to the *current* CURLYX block, the rules are:
5711
5712 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5713 and set cur_curlyx to point to the new block.
5714
5715 When popping the CURLYX block after a successful or unsuccessful match,
5716 restore the previous cur_curlyx.
5717
5718 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5719 to the outer one saved in the CURLYX block.
5720
5721 When popping the WHILEM block after a successful or unsuccessful B match,
5722 restore the previous cur_curlyx.
5723
5724 Here's an example for the pattern (AI* BI)*BO
5725 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5726
5727 cur_
5728 curlyx backtrack stack
5729 ------ ---------------
5730 NULL
5731 CO     <CO prev=NULL> <WO>
5732 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5733 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5734 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5735
5736 At this point the pattern succeeds, and we work back down the stack to
5737 clean up, restoring as we go:
5738
5739 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5740 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5741 CO     <CO prev=NULL> <WO>
5742 NULL
5743
5744 *******************************************************************/
5745
5746 #define ST st->u.curlyx
5747
5748   case CURLYX:    /* start of /A*B/  (for complex A) */
5749   {
5750    /* No need to save/restore up to this paren */
5751    I32 parenfloor = scan->flags;
5752
5753    assert(next); /* keep Coverity happy */
5754    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5755     next += ARG(next);
5756
5757    /* XXXX Probably it is better to teach regpush to support
5758    parenfloor > maxopenparen ... */
5759    if (parenfloor > (I32)rex->lastparen)
5760     parenfloor = rex->lastparen; /* Pessimization... */
5761
5762    ST.prev_curlyx= cur_curlyx;
5763    cur_curlyx = st;
5764    ST.cp = PL_savestack_ix;
5765
5766    /* these fields contain the state of the current curly.
5767    * they are accessed by subsequent WHILEMs */
5768    ST.parenfloor = parenfloor;
5769    ST.me = scan;
5770    ST.B = next;
5771    ST.minmod = minmod;
5772    minmod = 0;
5773    ST.count = -1; /* this will be updated by WHILEM */
5774    ST.lastloc = NULL;  /* this will be updated by WHILEM */
5775
5776    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5777    /* NOTREACHED */
5778    assert(0);
5779   }
5780
5781   case CURLYX_end: /* just finished matching all of A*B */
5782    cur_curlyx = ST.prev_curlyx;
5783    sayYES;
5784    /* NOTREACHED */
5785    assert(0);
5786
5787   case CURLYX_end_fail: /* just failed to match all of A*B */
5788    regcpblow(ST.cp);
5789    cur_curlyx = ST.prev_curlyx;
5790    sayNO;
5791    /* NOTREACHED */
5792    assert(0);
5793
5794
5795 #undef ST
5796 #define ST st->u.whilem
5797
5798   case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5799   {
5800    /* see the discussion above about CURLYX/WHILEM */
5801    I32 n;
5802    int min, max;
5803    regnode *A;
5804
5805    assert(cur_curlyx); /* keep Coverity happy */
5806
5807    min = ARG1(cur_curlyx->u.curlyx.me);
5808    max = ARG2(cur_curlyx->u.curlyx.me);
5809    A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5810    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5811    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5812    ST.cache_offset = 0;
5813    ST.cache_mask = 0;
5814
5815
5816    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5817     "%*s  whilem: matched %ld out of %d..%d\n",
5818     REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5819    );
5820
5821    /* First just match a string of min A's. */
5822
5823    if (n < min) {
5824     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5825          maxopenparen);
5826     cur_curlyx->u.curlyx.lastloc = locinput;
5827     REGCP_SET(ST.lastcp);
5828
5829     PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5830     /* NOTREACHED */
5831     assert(0);
5832    }
5833
5834    /* If degenerate A matches "", assume A done. */
5835
5836    if (locinput == cur_curlyx->u.curlyx.lastloc) {
5837     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5838     "%*s  whilem: empty match detected, trying continuation...\n",
5839     REPORT_CODE_OFF+depth*2, "")
5840     );
5841     goto do_whilem_B_max;
5842    }
5843
5844    /* super-linear cache processing.
5845    *
5846    * The idea here is that for certain types of CURLYX/WHILEM -
5847    * principally those whose upper bound is infinity (and
5848    * excluding regexes that have things like \1 and other very
5849    * non-regular expressiony things), then if a pattern like
5850    * /....A*.../ fails and we backtrack to the WHILEM, then we
5851    * make a note that this particular WHILEM op was at string
5852    * position 47 (say) when the rest of pattern failed. Then, if
5853    * we ever find ourselves back at that WHILEM, and at string
5854    * position 47 again, we can just fail immediately rather than
5855    * running the rest of the pattern again.
5856    *
5857    * This is very handy when patterns start to go
5858    * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5859    * with a combinatorial explosion of backtracking.
5860    *
5861    * The cache is implemented as a bit array, with one bit per
5862    * string byte position per WHILEM op (up to 16) - so it's
5863    * between 0.25 and 2x the string size.
5864    *
5865    * To avoid allocating a poscache buffer every time, we do an
5866    * initial countdown; only after we have executed a WHILEM
5867    * op (string-length x #WHILEMs) times do we allocate the
5868    * cache.
5869    *
5870    * The top 4 bits of scan->flags byte say how many different
5871    * relevant CURLYX/WHILEM op pairs there are, while the
5872    * bottom 4-bits is the identifying index number of this
5873    * WHILEM.
5874    */
5875
5876    if (scan->flags) {
5877
5878     if (!reginfo->poscache_maxiter) {
5879      /* start the countdown: Postpone detection until we
5880      * know the match is not *that* much linear. */
5881      reginfo->poscache_maxiter
5882       =    (reginfo->strend - reginfo->strbeg + 1)
5883       * (scan->flags>>4);
5884      /* possible overflow for long strings and many CURLYX's */
5885      if (reginfo->poscache_maxiter < 0)
5886       reginfo->poscache_maxiter = I32_MAX;
5887      reginfo->poscache_iter = reginfo->poscache_maxiter;
5888     }
5889
5890     if (reginfo->poscache_iter-- == 0) {
5891      /* initialise cache */
5892      const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
5893      regmatch_info_aux *const aux = reginfo->info_aux;
5894      if (aux->poscache) {
5895       if ((SSize_t)reginfo->poscache_size < size) {
5896        Renew(aux->poscache, size, char);
5897        reginfo->poscache_size = size;
5898       }
5899       Zero(aux->poscache, size, char);
5900      }
5901      else {
5902       reginfo->poscache_size = size;
5903       Newxz(aux->poscache, size, char);
5904      }
5905      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5906  "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5907        PL_colors[4], PL_colors[5])
5908      );
5909     }
5910
5911     if (reginfo->poscache_iter < 0) {
5912      /* have we already failed at this position? */
5913      SSize_t offset, mask;
5914
5915      reginfo->poscache_iter = -1; /* stop eventual underflow */
5916      offset  = (scan->flags & 0xf) - 1
5917         +   (locinput - reginfo->strbeg)
5918         * (scan->flags>>4);
5919      mask    = 1 << (offset % 8);
5920      offset /= 8;
5921      if (reginfo->info_aux->poscache[offset] & mask) {
5922       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5923        "%*s  whilem: (cache) already tried at this position...\n",
5924        REPORT_CODE_OFF+depth*2, "")
5925       );
5926       sayNO; /* cache records failure */
5927      }
5928      ST.cache_offset = offset;
5929      ST.cache_mask   = mask;
5930     }
5931    }
5932
5933    /* Prefer B over A for minimal matching. */
5934
5935    if (cur_curlyx->u.curlyx.minmod) {
5936     ST.save_curlyx = cur_curlyx;
5937     cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5938     ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5939        maxopenparen);
5940     REGCP_SET(ST.lastcp);
5941     PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5942          locinput);
5943     /* NOTREACHED */
5944     assert(0);
5945    }
5946
5947    /* Prefer A over B for maximal matching. */
5948
5949    if (n < max) { /* More greed allowed? */
5950     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5951        maxopenparen);
5952     cur_curlyx->u.curlyx.lastloc = locinput;
5953     REGCP_SET(ST.lastcp);
5954     PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5955     /* NOTREACHED */
5956     assert(0);
5957    }
5958    goto do_whilem_B_max;
5959   }
5960   /* NOTREACHED */
5961   assert(0);
5962
5963   case WHILEM_B_min: /* just matched B in a minimal match */
5964   case WHILEM_B_max: /* just matched B in a maximal match */
5965    cur_curlyx = ST.save_curlyx;
5966    sayYES;
5967    /* NOTREACHED */
5968    assert(0);
5969
5970   case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5971    cur_curlyx = ST.save_curlyx;
5972    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5973    cur_curlyx->u.curlyx.count--;
5974    CACHEsayNO;
5975    /* NOTREACHED */
5976    assert(0);
5977
5978   case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5979    /* FALLTHROUGH */
5980   case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5981    REGCP_UNWIND(ST.lastcp);
5982    regcppop(rex, &maxopenparen);
5983    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5984    cur_curlyx->u.curlyx.count--;
5985    CACHEsayNO;
5986    /* NOTREACHED */
5987    assert(0);
5988
5989   case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5990    REGCP_UNWIND(ST.lastcp);
5991    regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5992    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5993     "%*s  whilem: failed, trying continuation...\n",
5994     REPORT_CODE_OFF+depth*2, "")
5995    );
5996   do_whilem_B_max:
5997    if (cur_curlyx->u.curlyx.count >= REG_INFTY
5998     && ckWARN(WARN_REGEXP)
5999     && !reginfo->warned)
6000    {
6001     reginfo->warned = TRUE;
6002     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6003      "Complex regular subexpression recursion limit (%d) "
6004      "exceeded",
6005      REG_INFTY - 1);
6006    }
6007
6008    /* now try B */
6009    ST.save_curlyx = cur_curlyx;
6010    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6011    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
6012         locinput);
6013    /* NOTREACHED */
6014    assert(0);
6015
6016   case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
6017    cur_curlyx = ST.save_curlyx;
6018    REGCP_UNWIND(ST.lastcp);
6019    regcppop(rex, &maxopenparen);
6020
6021    if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
6022     /* Maximum greed exceeded */
6023     if (cur_curlyx->u.curlyx.count >= REG_INFTY
6024      && ckWARN(WARN_REGEXP)
6025      && !reginfo->warned)
6026     {
6027      reginfo->warned = TRUE;
6028      Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6029       "Complex regular subexpression recursion "
6030       "limit (%d) exceeded",
6031       REG_INFTY - 1);
6032     }
6033     cur_curlyx->u.curlyx.count--;
6034     CACHEsayNO;
6035    }
6036
6037    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6038     "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6039    );
6040    /* Try grabbing another A and see if it helps. */
6041    cur_curlyx->u.curlyx.lastloc = locinput;
6042    ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6043        maxopenparen);
6044    REGCP_SET(ST.lastcp);
6045    PUSH_STATE_GOTO(WHILEM_A_min,
6046     /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6047     locinput);
6048    /* NOTREACHED */
6049    assert(0);
6050
6051 #undef  ST
6052 #define ST st->u.branch
6053
6054   case BRANCHJ:     /*  /(...|A|...)/ with long next pointer */
6055    next = scan + ARG(scan);
6056    if (next == scan)
6057     next = NULL;
6058    scan = NEXTOPER(scan);
6059    /* FALLTHROUGH */
6060
6061   case BRANCH:     /*  /(...|A|...)/ */
6062    scan = NEXTOPER(scan); /* scan now points to inner node */
6063    ST.lastparen = rex->lastparen;
6064    ST.lastcloseparen = rex->lastcloseparen;
6065    ST.next_branch = next;
6066    REGCP_SET(ST.cp);
6067
6068    /* Now go into the branch */
6069    if (has_cutgroup) {
6070     PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
6071    } else {
6072     PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
6073    }
6074    /* NOTREACHED */
6075    assert(0);
6076
6077   case CUTGROUP:  /*  /(*THEN)/  */
6078    sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
6079     MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6080    PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
6081    /* NOTREACHED */
6082    assert(0);
6083
6084   case CUTGROUP_next_fail:
6085    do_cutgroup = 1;
6086    no_final = 1;
6087    if (st->u.mark.mark_name)
6088     sv_commit = st->u.mark.mark_name;
6089    sayNO;
6090    /* NOTREACHED */
6091    assert(0);
6092
6093   case BRANCH_next:
6094    sayYES;
6095    /* NOTREACHED */
6096    assert(0);
6097
6098   case BRANCH_next_fail: /* that branch failed; try the next, if any */
6099    if (do_cutgroup) {
6100     do_cutgroup = 0;
6101     no_final = 0;
6102    }
6103    REGCP_UNWIND(ST.cp);
6104    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6105    scan = ST.next_branch;
6106    /* no more branches? */
6107    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
6108     DEBUG_EXECUTE_r({
6109      PerlIO_printf( Perl_debug_log,
6110       "%*s  %sBRANCH failed...%s\n",
6111       REPORT_CODE_OFF+depth*2, "",
6112       PL_colors[4],
6113       PL_colors[5] );
6114     });
6115     sayNO_SILENT;
6116    }
6117    continue; /* execute next BRANCH[J] op */
6118    /* NOTREACHED */
6119    assert(0);
6120
6121   case MINMOD: /* next op will be non-greedy, e.g. A*?  */
6122    minmod = 1;
6123    break;
6124
6125 #undef  ST
6126 #define ST st->u.curlym
6127
6128   case CURLYM: /* /A{m,n}B/ where A is fixed-length */
6129
6130    /* This is an optimisation of CURLYX that enables us to push
6131    * only a single backtracking state, no matter how many matches
6132    * there are in {m,n}. It relies on the pattern being constant
6133    * length, with no parens to influence future backrefs
6134    */
6135
6136    ST.me = scan;
6137    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6138
6139    ST.lastparen      = rex->lastparen;
6140    ST.lastcloseparen = rex->lastcloseparen;
6141
6142    /* if paren positive, emulate an OPEN/CLOSE around A */
6143    if (ST.me->flags) {
6144     U32 paren = ST.me->flags;
6145     if (paren > maxopenparen)
6146      maxopenparen = paren;
6147     scan += NEXT_OFF(scan); /* Skip former OPEN. */
6148    }
6149    ST.A = scan;
6150    ST.B = next;
6151    ST.alen = 0;
6152    ST.count = 0;
6153    ST.minmod = minmod;
6154    minmod = 0;
6155    ST.c1 = CHRTEST_UNINIT;
6156    REGCP_SET(ST.cp);
6157
6158    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
6159     goto curlym_do_B;
6160
6161   curlym_do_A: /* execute the A in /A{m,n}B/  */
6162    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
6163    /* NOTREACHED */
6164    assert(0);
6165
6166   case CURLYM_A: /* we've just matched an A */
6167    ST.count++;
6168    /* after first match, determine A's length: u.curlym.alen */
6169    if (ST.count == 1) {
6170     if (reginfo->is_utf8_target) {
6171      char *s = st->locinput;
6172      while (s < locinput) {
6173       ST.alen++;
6174       s += UTF8SKIP(s);
6175      }
6176     }
6177     else {
6178      ST.alen = locinput - st->locinput;
6179     }
6180     if (ST.alen == 0)
6181      ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
6182    }
6183    DEBUG_EXECUTE_r(
6184     PerlIO_printf(Perl_debug_log,
6185       "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
6186       (int)(REPORT_CODE_OFF+(depth*2)), "",
6187       (IV) ST.count, (IV)ST.alen)
6188    );
6189
6190    if (cur_eval && cur_eval->u.eval.close_paren &&
6191     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6192     goto fake_end;
6193
6194    {
6195     I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
6196     if ( max == REG_INFTY || ST.count < max )
6197      goto curlym_do_A; /* try to match another A */
6198    }
6199    goto curlym_do_B; /* try to match B */
6200
6201   case CURLYM_A_fail: /* just failed to match an A */
6202    REGCP_UNWIND(ST.cp);
6203
6204    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
6205     || (cur_eval && cur_eval->u.eval.close_paren &&
6206      cur_eval->u.eval.close_paren == (U32)ST.me->flags))
6207     sayNO;
6208
6209   curlym_do_B: /* execute the B in /A{m,n}B/  */
6210    if (ST.c1 == CHRTEST_UNINIT) {
6211     /* calculate c1 and c2 for possible match of 1st char
6212     * following curly */
6213     ST.c1 = ST.c2 = CHRTEST_VOID;
6214     assert(ST.B);
6215     if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
6216      regnode *text_node = ST.B;
6217      if (! HAS_TEXT(text_node))
6218       FIND_NEXT_IMPT(text_node);
6219      /* this used to be
6220
6221       (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
6222
6223        But the former is redundant in light of the latter.
6224
6225        if this changes back then the macros for
6226        IS_TEXT and friends need to change.
6227      */
6228      if (PL_regkind[OP(text_node)] == EXACT) {
6229       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6230       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6231       reginfo))
6232       {
6233        sayNO;
6234       }
6235      }
6236     }
6237    }
6238
6239    DEBUG_EXECUTE_r(
6240     PerlIO_printf(Perl_debug_log,
6241      "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
6242      (int)(REPORT_CODE_OFF+(depth*2)),
6243      "", (IV)ST.count)
6244     );
6245    if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
6246     if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
6247      if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6248       && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6249      {
6250       /* simulate B failing */
6251       DEBUG_OPTIMISE_r(
6252        PerlIO_printf(Perl_debug_log,
6253         "%*s  CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
6254         (int)(REPORT_CODE_OFF+(depth*2)),"",
6255         valid_utf8_to_uvchr((U8 *) locinput, NULL),
6256         valid_utf8_to_uvchr(ST.c1_utf8, NULL),
6257         valid_utf8_to_uvchr(ST.c2_utf8, NULL))
6258       );
6259       state_num = CURLYM_B_fail;
6260       goto reenter_switch;
6261      }
6262     }
6263     else if (nextchr != ST.c1 && nextchr != ST.c2) {
6264      /* simulate B failing */
6265      DEBUG_OPTIMISE_r(
6266       PerlIO_printf(Perl_debug_log,
6267        "%*s  CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
6268        (int)(REPORT_CODE_OFF+(depth*2)),"",
6269        (int) nextchr, ST.c1, ST.c2)
6270      );
6271      state_num = CURLYM_B_fail;
6272      goto reenter_switch;
6273     }
6274    }
6275
6276    if (ST.me->flags) {
6277     /* emulate CLOSE: mark current A as captured */
6278     I32 paren = ST.me->flags;
6279     if (ST.count) {
6280      rex->offs[paren].start
6281       = HOPc(locinput, -ST.alen) - reginfo->strbeg;
6282      rex->offs[paren].end = locinput - reginfo->strbeg;
6283      if ((U32)paren > rex->lastparen)
6284       rex->lastparen = paren;
6285      rex->lastcloseparen = paren;
6286     }
6287     else
6288      rex->offs[paren].end = -1;
6289     if (cur_eval && cur_eval->u.eval.close_paren &&
6290      cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6291     {
6292      if (ST.count)
6293       goto fake_end;
6294      else
6295       sayNO;
6296     }
6297    }
6298
6299    PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
6300    /* NOTREACHED */
6301    assert(0);
6302
6303   case CURLYM_B_fail: /* just failed to match a B */
6304    REGCP_UNWIND(ST.cp);
6305    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6306    if (ST.minmod) {
6307     I32 max = ARG2(ST.me);
6308     if (max != REG_INFTY && ST.count == max)
6309      sayNO;
6310     goto curlym_do_A; /* try to match a further A */
6311    }
6312    /* backtrack one A */
6313    if (ST.count == ARG1(ST.me) /* min */)
6314     sayNO;
6315    ST.count--;
6316    SET_locinput(HOPc(locinput, -ST.alen));
6317    goto curlym_do_B; /* try to match B */
6318
6319 #undef ST
6320 #define ST st->u.curly
6321
6322 #define CURLY_SETPAREN(paren, success) /* set or clear capture 'paren' after a STAR/PLUS/CURLY/CURLYN repeat; expects rex, reginfo, locinput and ST in scope */ \
6323  if (paren) { /* callers pass ST.paren, which is 0 when there is no group to set */ \
6324   if (success) { /* capture runs from HOPc(locinput, -1) to locinput, as offsets from strbeg */ \
6325    rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6326    rex->offs[paren].end = locinput - reginfo->strbeg; \
6327    if (paren > rex->lastparen) \
6328     rex->lastparen = paren; \
6329    rex->lastcloseparen = paren; \
6330   } \
6331   else { /* no success: set end to -1 and put back the lastparen/lastcloseparen saved in ST */ \
6332    rex->offs[paren].end = -1; \
6333    rex->lastparen      = ST.lastparen; \
6334    rex->lastcloseparen = ST.lastcloseparen; \
6335   } \
6336  }
6337
6338   case STAR:  /*  /A*B/ where A is width 1 char */
6339    ST.paren = 0;
6340    ST.min = 0;
6341    ST.max = REG_INFTY;
6342    scan = NEXTOPER(scan);
6343    goto repeat;
6344
6345   case PLUS:  /*  /A+B/ where A is width 1 char */
6346    ST.paren = 0;
6347    ST.min = 1;
6348    ST.max = REG_INFTY;
6349    scan = NEXTOPER(scan);
6350    goto repeat;
6351
6352   case CURLYN:  /*  /(A){m,n}B/ where A is width 1 char */
6353    ST.paren = scan->flags; /* Which paren to set */
6354    ST.lastparen      = rex->lastparen;
6355    ST.lastcloseparen = rex->lastcloseparen;
6356    if (ST.paren > maxopenparen)
6357     maxopenparen = ST.paren;
6358    ST.min = ARG1(scan);  /* min to match */
6359    ST.max = ARG2(scan);  /* max to match */
6360    if (cur_eval && cur_eval->u.eval.close_paren &&
6361     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6362     ST.min=1;
6363     ST.max=1;
6364    }
6365    scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6366    goto repeat;
6367
6368   case CURLY:  /*  /A{m,n}B/ where A is width 1 char */
6369    ST.paren = 0;
6370    ST.min = ARG1(scan);  /* min to match */
6371    ST.max = ARG2(scan);  /* max to match */
6372    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6373   repeat:
6374    /*
6375    * Lookahead to avoid useless match attempts
6376    * when we know what character comes next.
6377    *
6378    * Used to only do .*x and .*?x, but now it allows
6379    * for )'s, ('s and (?{ ... })'s to be in the way
6380    * of the quantifier and the EXACT-like node.  -- japhy
6381    */
6382
6383    assert(ST.min <= ST.max);
6384    if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6385     ST.c1 = ST.c2 = CHRTEST_VOID;
6386    }
6387    else {
6388     regnode *text_node = next;
6389
6390     if (! HAS_TEXT(text_node))
6391      FIND_NEXT_IMPT(text_node);
6392
6393     if (! HAS_TEXT(text_node))
6394      ST.c1 = ST.c2 = CHRTEST_VOID;
6395     else {
6396      if ( PL_regkind[OP(text_node)] != EXACT ) {
6397       ST.c1 = ST.c2 = CHRTEST_VOID;
6398      }
6399      else {
6400
6401      /*  Currently we only get here when
6402
6403       PL_regkind[OP(text_node)] == EXACT
6404
6405       if this changes back then the macros for IS_TEXT and
6406       friends need to change. */
6407       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6408       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6409       reginfo))
6410       {
6411        sayNO;
6412       }
6413      }
6414     }
6415    }
6416
6417    ST.A = scan;
6418    ST.B = next;
6419    if (minmod) {
6420     char *li = locinput;
6421     minmod = 0;
6422     if (ST.min &&
6423       regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6424        < ST.min)
6425      sayNO;
6426     SET_locinput(li);
6427     ST.count = ST.min;
6428     REGCP_SET(ST.cp);
6429     if (ST.c1 == CHRTEST_VOID)
6430      goto curly_try_B_min;
6431
6432     ST.oldloc = locinput;
6433
6434     /* set ST.maxpos to the furthest point along the
6435     * string that could possibly match */
6436     if  (ST.max == REG_INFTY) {
6437      ST.maxpos = reginfo->strend - 1;
6438      if (utf8_target)
6439       while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6440        ST.maxpos--;
6441     }
6442     else if (utf8_target) {
6443      int m = ST.max - ST.min;
6444      for (ST.maxpos = locinput;
6445       m >0 && ST.maxpos < reginfo->strend; m--)
6446       ST.maxpos += UTF8SKIP(ST.maxpos);
6447     }
6448     else {
6449      ST.maxpos = locinput + ST.max - ST.min;
6450      if (ST.maxpos >= reginfo->strend)
6451       ST.maxpos = reginfo->strend - 1;
6452     }
6453     goto curly_try_B_min_known;
6454
6455    }
6456    else {
6457     /* avoid taking address of locinput, so it can remain
6458     * a register var */
6459     char *li = locinput;
6460     ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6461     if (ST.count < ST.min)
6462      sayNO;
6463     SET_locinput(li);
6464     if ((ST.count > ST.min)
6465      && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6466     {
6467      /* A{m,n} must come at the end of the string, there's
6468      * no point in backing off ... */
6469      ST.min = ST.count;
6470      /* ...except that $ and \Z can match before *and* after
6471      newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6472      We may back off by one in this case. */
6473      if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6474       ST.min--;
6475     }
6476     REGCP_SET(ST.cp);
6477     goto curly_try_B_max;
6478    }
6479    /* NOTREACHED */
6480    assert(0);
6481
6482   case CURLY_B_min_known_fail:
6483    /* failed to find B in a non-greedy match where c1,c2 valid */
6484
6485    REGCP_UNWIND(ST.cp);
6486    if (ST.paren) {
6487     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6488    }
6489    /* Couldn't or didn't -- move forward. */
6490    ST.oldloc = locinput;
6491    if (utf8_target)
6492     locinput += UTF8SKIP(locinput);
6493    else
6494     locinput++;
6495    ST.count++;
6496   curly_try_B_min_known:
6497    /* find the next place where 'B' could work, then call B */
6498    {
6499     int n;
6500     if (utf8_target) {
6501      n = (ST.oldloc == locinput) ? 0 : 1;
6502      if (ST.c1 == ST.c2) {
6503       /* set n to utf8_distance(oldloc, locinput) */
6504       while (locinput <= ST.maxpos
6505        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6506       {
6507        locinput += UTF8SKIP(locinput);
6508        n++;
6509       }
6510      }
6511      else {
6512       /* set n to utf8_distance(oldloc, locinput) */
6513       while (locinput <= ST.maxpos
6514        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6515        && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6516       {
6517        locinput += UTF8SKIP(locinput);
6518        n++;
6519       }
6520      }
6521     }
6522     else {  /* Not utf8_target */
6523      if (ST.c1 == ST.c2) {
6524       while (locinput <= ST.maxpos &&
6525        UCHARAT(locinput) != ST.c1)
6526        locinput++;
6527      }
6528      else {
6529       while (locinput <= ST.maxpos
6530        && UCHARAT(locinput) != ST.c1
6531        && UCHARAT(locinput) != ST.c2)
6532        locinput++;
6533      }
6534      n = locinput - ST.oldloc;
6535     }
6536     if (locinput > ST.maxpos)
6537      sayNO;
6538     if (n) {
6539      /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6540      * at b; check that everything between oldloc and
6541      * locinput matches */
6542      char *li = ST.oldloc;
6543      ST.count += n;
6544      if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6545       sayNO;
6546      assert(n == REG_INFTY || locinput == li);
6547     }
6548     CURLY_SETPAREN(ST.paren, ST.count);
6549     if (cur_eval && cur_eval->u.eval.close_paren &&
6550      cur_eval->u.eval.close_paren == (U32)ST.paren) {
6551      goto fake_end;
6552     }
6553     PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6554    }
6555    /* NOTREACHED */
6556    assert(0);
6557
6558   case CURLY_B_min_fail:
6559    /* failed to find B in a non-greedy match where c1,c2 invalid */
6560
6561    REGCP_UNWIND(ST.cp);
6562    if (ST.paren) {
6563     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6564    }
6565    /* failed -- move forward one */
6566    {
6567     char *li = locinput;
6568     if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6569      sayNO;
6570     }
6571     locinput = li;
6572    }
6573    {
6574     ST.count++;
6575     if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6576       ST.count > 0)) /* count overflow ? */
6577     {
6578     curly_try_B_min:
6579      CURLY_SETPAREN(ST.paren, ST.count);
6580      if (cur_eval && cur_eval->u.eval.close_paren &&
6581       cur_eval->u.eval.close_paren == (U32)ST.paren) {
6582       goto fake_end;
6583      }
6584      PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6585     }
6586    }
6587    sayNO;
6588    /* NOTREACHED */
6589    assert(0);
6590
6591   curly_try_B_max:
6592    /* a successful greedy match: now try to match B */
6593    if (cur_eval && cur_eval->u.eval.close_paren &&
6594     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6595     goto fake_end;
6596    }
6597    {
6598     bool could_match = locinput < reginfo->strend;
6599
6600     /* If it could work, try it. */
6601     if (ST.c1 != CHRTEST_VOID && could_match) {
6602      if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6603      {
6604       could_match = memEQ(locinput,
6605            ST.c1_utf8,
6606            UTF8SKIP(locinput))
6607          || memEQ(locinput,
6608            ST.c2_utf8,
6609            UTF8SKIP(locinput));
6610      }
6611      else {
6612       could_match = UCHARAT(locinput) == ST.c1
6613          || UCHARAT(locinput) == ST.c2;
6614      }
6615     }
6616     if (ST.c1 == CHRTEST_VOID || could_match) {
6617      CURLY_SETPAREN(ST.paren, ST.count);
6618      PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6619      /* NOTREACHED */
6620      assert(0);
6621     }
6622    }
6623    /* FALLTHROUGH */
6624
6625   case CURLY_B_max_fail:
6626    /* failed to find B in a greedy match */
6627
6628    REGCP_UNWIND(ST.cp);
6629    if (ST.paren) {
6630     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6631    }
6632    /*  back up. */
6633    if (--ST.count < ST.min)
6634     sayNO;
6635    locinput = HOPc(locinput, -1);
6636    goto curly_try_B_max;
6637
6638 #undef ST
6639
6640   case END: /*  last op of main pattern  */
6641    fake_end:
6642    if (cur_eval) {
6643     /* we've just finished A in /(??{A})B/; now continue with B */
6644
6645     st->u.eval.prev_rex = rex_sv;  /* inner */
6646
6647     /* Save *all* the positions. */
6648     st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6649     rex_sv = cur_eval->u.eval.prev_rex;
6650     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6651     SET_reg_curpm(rex_sv);
6652     rex = ReANY(rex_sv);
6653     rexi = RXi_GET(rex);
6654     cur_curlyx = cur_eval->u.eval.prev_curlyx;
6655
6656     REGCP_SET(st->u.eval.lastcp);
6657
6658     /* Restore parens of the outer rex without popping the
6659     * savestack */
6660     S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6661           &maxopenparen);
6662
6663     st->u.eval.prev_eval = cur_eval;
6664     cur_eval = cur_eval->u.eval.prev_eval;
6665     DEBUG_EXECUTE_r(
6666      PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6667          REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6668     if ( nochange_depth )
6669      nochange_depth--;
6670
6671     PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6672          locinput); /* match B */
6673    }
6674
6675    if (locinput < reginfo->till) {
6676     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6677          "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6678          PL_colors[4],
6679          (long)(locinput - startpos),
6680          (long)(reginfo->till - startpos),
6681          PL_colors[5]));
6682
6683     sayNO_SILENT;  /* Cannot match: too short. */
6684    }
6685    sayYES;   /* Success! */
6686
6687   case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6688    DEBUG_EXECUTE_r(
6689    PerlIO_printf(Perl_debug_log,
6690     "%*s  %ssubpattern success...%s\n",
6691     REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6692    sayYES;   /* Success! */
6693
6694 #undef  ST
6695 #define ST st->u.ifmatch
6696
6697   {
6698    char *newstart;
6699
6700   case SUSPEND: /* (?>A) */
6701    ST.wanted = 1;
6702    newstart = locinput;
6703    goto do_ifmatch;
6704
6705   case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6706    ST.wanted = 0;
6707    goto ifmatch_trivial_fail_test;
6708
6709   case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6710    ST.wanted = 1;
6711   ifmatch_trivial_fail_test:
6712    if (scan->flags) {
6713     char * const s = HOPBACKc(locinput, scan->flags);
6714     if (!s) {
6715      /* trivial fail */
6716      if (logical) {
6717       logical = 0;
6718       sw = 1 - cBOOL(ST.wanted);
6719      }
6720      else if (ST.wanted)
6721       sayNO;
6722      next = scan + ARG(scan);
6723      if (next == scan)
6724       next = NULL;
6725      break;
6726     }
6727     newstart = s;
6728    }
6729    else
6730     newstart = locinput;
6731
6732   do_ifmatch:
6733    ST.me = scan;
6734    ST.logical = logical;
6735    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6736
6737    /* execute body of (?...A) */
6738    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6739    /* NOTREACHED */
6740    assert(0);
6741   }
6742
6743   case IFMATCH_A_fail: /* body of (?...A) failed */
6744    ST.wanted = !ST.wanted;
6745    /* FALLTHROUGH */
6746
6747   case IFMATCH_A: /* body of (?...A) succeeded */
6748    if (ST.logical) {
6749     sw = cBOOL(ST.wanted);
6750    }
6751    else if (!ST.wanted)
6752     sayNO;
6753
6754    if (OP(ST.me) != SUSPEND) {
6755     /* restore old position except for (?>...) */
6756     locinput = st->locinput;
6757    }
6758    scan = ST.me + ARG(ST.me);
6759    if (scan == ST.me)
6760     scan = NULL;
6761    continue; /* execute B */
6762
6763 #undef ST
6764
6765   case LONGJMP: /*  alternative with many branches compiles to
6766      * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6767    next = scan + ARG(scan);
6768    if (next == scan)
6769     next = NULL;
6770    break;
6771
6772   case COMMIT:  /*  (*COMMIT)  */
6773    reginfo->cutpoint = reginfo->strend;
6774    /* FALLTHROUGH */
6775
6776   case PRUNE:   /*  (*PRUNE)   */
6777    if (!scan->flags)
6778     sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6779    PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6780    /* NOTREACHED */
6781    assert(0);
6782
6783   case COMMIT_next_fail:
6784    no_final = 1;
6785    /* FALLTHROUGH */
6786
6787   case OPFAIL:   /* (*FAIL)  */
6788    sayNO;
6789    /* NOTREACHED */
6790    assert(0);
6791
6792 #define ST st->u.mark
6793   case MARKPOINT: /*  (*MARK:foo)  */
6794    ST.prev_mark = mark_state;
6795    ST.mark_name = sv_commit = sv_yes_mark
6796     = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6797    mark_state = st;
6798    ST.mark_loc = locinput;
6799    PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6800    /* NOTREACHED */
6801    assert(0);
6802
6803   case MARKPOINT_next:
6804    mark_state = ST.prev_mark;
6805    sayYES;
6806    /* NOTREACHED */
6807    assert(0);
6808
6809   case MARKPOINT_next_fail:
6810    if (popmark && sv_eq(ST.mark_name,popmark))
6811    {
6812     if (ST.mark_loc > startpoint)
6813      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6814     popmark = NULL; /* we found our mark */
6815     sv_commit = ST.mark_name;
6816
6817     DEBUG_EXECUTE_r({
6818       PerlIO_printf(Perl_debug_log,
6819        "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6820        REPORT_CODE_OFF+depth*2, "",
6821        PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6822     });
6823    }
6824    mark_state = ST.prev_mark;
6825    sv_yes_mark = mark_state ?
6826     mark_state->u.mark.mark_name : NULL;
6827    sayNO;
6828    /* NOTREACHED */
6829    assert(0);
6830
6831   case SKIP:  /*  (*SKIP)  */
6832    if (scan->flags) {
6833     /* (*SKIP) : if we fail we cut here*/
6834     ST.mark_name = NULL;
6835     ST.mark_loc = locinput;
6836     PUSH_STATE_GOTO(SKIP_next,next, locinput);
6837    } else {
6838     /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6839     otherwise do nothing.  Meaning we need to scan
6840     */
6841     regmatch_state *cur = mark_state;
6842     SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6843
6844     while (cur) {
6845      if ( sv_eq( cur->u.mark.mark_name,
6846         find ) )
6847      {
6848       ST.mark_name = find;
6849       PUSH_STATE_GOTO( SKIP_next, next, locinput);
6850      }
6851      cur = cur->u.mark.prev_mark;
6852     }
6853    }
6854    /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6855    break;
6856
6857   case SKIP_next_fail:
6858    if (ST.mark_name) {
6859     /* (*CUT:NAME) - Set up to search for the name as we
6860     collapse the stack*/
6861     popmark = ST.mark_name;
6862    } else {
6863     /* (*CUT) - No name, we cut here.*/
6864     if (ST.mark_loc > startpoint)
6865      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6866     /* but we set sv_commit to latest mark_name if there
6867     is one so they can test to see how things lead to this
6868     cut */
6869     if (mark_state)
6870      sv_commit=mark_state->u.mark.mark_name;
6871    }
6872    no_final = 1;
6873    sayNO;
6874    /* NOTREACHED */
6875    assert(0);
6876 #undef ST
6877
6878   case LNBREAK: /* \R */
6879    if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6880     locinput += n;
6881    } else
6882     sayNO;
6883    break;
6884
6885   default:
6886    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6887       PTR2UV(scan), OP(scan));
6888    Perl_croak(aTHX_ "regexp memory corruption");
6889
6890   /* this is a point to jump to in order to increment
6891   * locinput by one character */
6892   increment_locinput:
6893    assert(!NEXTCHR_IS_EOS);
6894    if (utf8_target) {
6895     locinput += PL_utf8skip[nextchr];
6896     /* locinput is allowed to go 1 char off the end, but not 2+ */
6897     if (locinput > reginfo->strend)
6898      sayNO;
6899    }
6900    else
6901     locinput++;
6902    break;
6903
6904   } /* end switch */
6905
6906   /* switch break jumps here */
6907   scan = next; /* prepare to execute the next op and ... */
6908   continue;    /* ... jump back to the top, reusing st */
6909   /* NOTREACHED */
6910   assert(0);
6911
6912  push_yes_state:
6913   /* push a state that backtracks on success */
6914   st->u.yes.prev_yes_state = yes_state;
6915   yes_state = st;
6916   /* FALLTHROUGH */
6917  push_state:
6918   /* push a new regex state, then continue at scan  */
6919   {
6920    regmatch_state *newst;
6921
6922    DEBUG_STACK_r({
6923     regmatch_state *cur = st;
6924     regmatch_state *curyes = yes_state;
6925     int curd = depth;
6926     regmatch_slab *slab = PL_regmatch_slab;
6927     for (;curd > -1;cur--,curd--) {
6928      if (cur < SLAB_FIRST(slab)) {
6929       slab = slab->prev;
6930       cur = SLAB_LAST(slab);
6931      }
6932      PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6933       REPORT_CODE_OFF + 2 + depth * 2,"",
6934       curd, PL_reg_name[cur->resume_state],
6935       (curyes == cur) ? "yes" : ""
6936      );
6937      if (curyes == cur)
6938       curyes = cur->u.yes.prev_yes_state;
6939     }
6940    } else
6941     DEBUG_STATE_pp("push")
6942    );
6943    depth++;
6944    st->locinput = locinput;
6945    newst = st+1;
6946    if (newst >  SLAB_LAST(PL_regmatch_slab))
6947     newst = S_push_slab(aTHX);
6948    PL_regmatch_state = newst;
6949
6950    locinput = pushinput;
6951    st = newst;
6952    continue;
6953    /* NOTREACHED */
6954    assert(0);
6955   }
6956  }
6957
6958  /*
6959  * We get here only if there's trouble -- normally "case END" is
6960  * the terminating point.
6961  */
6962  Perl_croak(aTHX_ "corrupted regexp pointers");
6963  /* NOTREACHED */
6964  sayNO;
6965
6966 yes:
6967  if (yes_state) {
6968   /* we have successfully completed a subexpression, but we must now
6969   * pop to the state marked by yes_state and continue from there */
6970   assert(st != yes_state);
6971 #ifdef DEBUGGING
6972   while (st != yes_state) {
6973    st--;
6974    if (st < SLAB_FIRST(PL_regmatch_slab)) {
6975     PL_regmatch_slab = PL_regmatch_slab->prev;
6976     st = SLAB_LAST(PL_regmatch_slab);
6977    }
6978    DEBUG_STATE_r({
6979     if (no_final) {
6980      DEBUG_STATE_pp("pop (no final)");
6981     } else {
6982      DEBUG_STATE_pp("pop (yes)");
6983     }
6984    });
6985    depth--;
6986   }
6987 #else
6988   while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6989    || yes_state > SLAB_LAST(PL_regmatch_slab))
6990   {
6991    /* not in this slab, pop slab */
6992    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6993    PL_regmatch_slab = PL_regmatch_slab->prev;
6994    st = SLAB_LAST(PL_regmatch_slab);
6995   }
6996   depth -= (st - yes_state);
6997 #endif
6998   st = yes_state;
6999   yes_state = st->u.yes.prev_yes_state;
7000   PL_regmatch_state = st;
7001
7002   if (no_final)
7003    locinput= st->locinput;
7004   state_num = st->resume_state + no_final;
7005   goto reenter_switch;
7006  }
7007
7008  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
7009       PL_colors[4], PL_colors[5]));
7010
7011  if (reginfo->info_aux_eval) {
7012   /* each successfully executed (?{...}) block does the equivalent of
7013   *   local $^R = do {...}
7014   * When popping the save stack, all these locals would be undone;
7015   * bypass this by setting the outermost saved $^R to the latest
7016   * value */
7017   /* I dont know if this is needed or works properly now.
7018   * see code related to PL_replgv elsewhere in this file.
7019   * Yves
7020   */
7021   if (oreplsv != GvSV(PL_replgv))
7022    sv_setsv(oreplsv, GvSV(PL_replgv));
7023  }
7024  result = 1;
7025  goto final_exit;
7026
7027 no:
7028  DEBUG_EXECUTE_r(
7029   PerlIO_printf(Perl_debug_log,
7030    "%*s  %sfailed...%s\n",
7031    REPORT_CODE_OFF+depth*2, "",
7032    PL_colors[4], PL_colors[5])
7033   );
7034
7035 no_silent:
7036  if (no_final) {
7037   if (yes_state) {
7038    goto yes;
7039   } else {
7040    goto final_exit;
7041   }
7042  }
7043  if (depth) {
7044   /* there's a previous state to backtrack to */
7045   st--;
7046   if (st < SLAB_FIRST(PL_regmatch_slab)) {
7047    PL_regmatch_slab = PL_regmatch_slab->prev;
7048    st = SLAB_LAST(PL_regmatch_slab);
7049   }
7050   PL_regmatch_state = st;
7051   locinput= st->locinput;
7052
7053   DEBUG_STATE_pp("pop");
7054   depth--;
7055   if (yes_state == st)
7056    yes_state = st->u.yes.prev_yes_state;
7057
7058   state_num = st->resume_state + 1; /* failure = success + 1 */
7059   goto reenter_switch;
7060  }
7061  result = 0;
7062
7063   final_exit:
7064  if (rex->intflags & PREGf_VERBARG_SEEN) {
7065   SV *sv_err = get_sv("REGERROR", 1);
7066   SV *sv_mrk = get_sv("REGMARK", 1);
7067   if (result) {
7068    sv_commit = &PL_sv_no;
7069    if (!sv_yes_mark)
7070     sv_yes_mark = &PL_sv_yes;
7071   } else {
7072    if (!sv_commit)
7073     sv_commit = &PL_sv_yes;
7074    sv_yes_mark = &PL_sv_no;
7075   }
7076   assert(sv_err);
7077   assert(sv_mrk);
7078   sv_setsv(sv_err, sv_commit);
7079   sv_setsv(sv_mrk, sv_yes_mark);
7080  }
7081
7082
7083  if (last_pushed_cv) {
7084   dSP;
7085   POP_MULTICALL;
7086   PERL_UNUSED_VAR(SP);
7087  }
7088
7089  assert(!result ||  locinput - reginfo->strbeg >= 0);
7090  return result ?  locinput - reginfo->strbeg : -1;
7091 }
7092
7093 /*
7094  - regrepeat - repeatedly match something simple, report how many
7095  *
7096  * What 'simple' means is a node which can be the operand of a quantifier like
7097  * '+', or {1,3}
7098  *
7099  * startposp - pointer a pointer to the start position.  This is updated
7100  *             to point to the byte following the highest successful
7101  *             match.
7102  * p         - the regnode to be repeatedly matched against.
7103  * reginfo   - struct holding match state, such as strend
7104  * max       - maximum number of things to match.
7105  * depth     - (for debugging) backtracking depth.
7106  */
7107 STATIC I32
7108 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
7109    regmatch_info *const reginfo, I32 max, int depth)
7110 {
7111  char *scan;     /* Pointer to current position in target string */
7112  I32 c;
7113  char *loceol = reginfo->strend;   /* local version */
7114  I32 hardcount = 0;  /* How many matches so far */
7115  bool utf8_target = reginfo->is_utf8_target;
7116  int to_complement = 0;  /* Invert the result? */
7117  UV utf8_flags;
7118  _char_class_number classnum;
7119 #ifndef DEBUGGING
7120  PERL_UNUSED_ARG(depth);
7121 #endif
7122
7123  PERL_ARGS_ASSERT_REGREPEAT;
7124
7125  scan = *startposp;
7126  if (max == REG_INFTY)
7127   max = I32_MAX;
7128  else if (! utf8_target && loceol - scan > max)
7129   loceol = scan + max;
7130
7131  /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
7132  * to the maximum of how far we should go in it (leaving it set to the real
7133  * end, if the maximum permissible would take us beyond that).  This allows
7134  * us to make the loop exit condition that we haven't gone past <loceol> to
7135  * also mean that we haven't exceeded the max permissible count, saving a
7136  * test each time through the loop.  But it assumes that the OP matches a
7137  * single byte, which is true for most of the OPs below when applied to a
7138  * non-UTF-8 target.  Those relatively few OPs that don't have this
7139  * characteristic will have to compensate.
7140  *
7141  * There is no adjustment for UTF-8 targets, as the number of bytes per
7142  * character varies.  OPs will have to test both that the count is less
7143  * than the max permissible (using <hardcount> to keep track), and that we
7144  * are still within the bounds of the string (using <loceol>.  A few OPs
7145  * match a single byte no matter what the encoding.  They can omit the max
7146  * test if, for the UTF-8 case, they do the adjustment that was skipped
7147  * above.
7148  *
7149  * Thus, the code above sets things up for the common case; and exceptional
7150  * cases need extra work; the common case is to make sure <scan> doesn't
7151  * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
7152  * count doesn't exceed the maximum permissible */
7153
7154  switch (OP(p)) {
7155  case REG_ANY:
7156   if (utf8_target) {
7157    while (scan < loceol && hardcount < max && *scan != '\n') {
7158     scan += UTF8SKIP(scan);
7159     hardcount++;
7160    }
7161   } else {
7162    while (scan < loceol && *scan != '\n')
7163     scan++;
7164   }
7165   break;
7166  case SANY:
7167   if (utf8_target) {
7168    while (scan < loceol && hardcount < max) {
7169     scan += UTF8SKIP(scan);
7170     hardcount++;
7171    }
7172   }
7173   else
7174    scan = loceol;
7175   break;
7176  case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
7177   if (utf8_target && loceol - scan > max) {
7178
7179    /* <loceol> hadn't been adjusted in the UTF-8 case */
7180    scan +=  max;
7181   }
7182   else {
7183    scan = loceol;
7184   }
7185   break;
7186  case EXACT:
7187   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7188
7189   c = (U8)*STRING(p);
7190
7191   /* Can use a simple loop if the pattern char to match on is invariant
7192   * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
7193   * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
7194   * true iff it doesn't matter if the argument is in UTF-8 or not */
7195   if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
7196    if (utf8_target && loceol - scan > max) {
7197     /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7198     * since here, to match at all, 1 char == 1 byte */
7199     loceol = scan + max;
7200    }
7201    while (scan < loceol && UCHARAT(scan) == c) {
7202     scan++;
7203    }
7204   }
7205   else if (reginfo->is_utf8_pat) {
7206    if (utf8_target) {
7207     STRLEN scan_char_len;
7208
7209     /* When both target and pattern are UTF-8, we have to do
7210     * string EQ */
7211     while (hardcount < max
7212      && scan < loceol
7213      && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
7214      && memEQ(scan, STRING(p), scan_char_len))
7215     {
7216      scan += scan_char_len;
7217      hardcount++;
7218     }
7219    }
7220    else if (! UTF8_IS_ABOVE_LATIN1(c)) {
7221
7222     /* Target isn't utf8; convert the character in the UTF-8
7223     * pattern to non-UTF8, and do a simple loop */
7224     c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
7225     while (scan < loceol && UCHARAT(scan) == c) {
7226      scan++;
7227     }
7228    } /* else pattern char is above Latin1, can't possibly match the
7229     non-UTF-8 target */
7230   }
7231   else {
7232
7233    /* Here, the string must be utf8; pattern isn't, and <c> is
7234    * different in utf8 than not, so can't compare them directly.
7235    * Outside the loop, find the two utf8 bytes that represent c, and
7236    * then look for those in sequence in the utf8 string */
7237    U8 high = UTF8_TWO_BYTE_HI(c);
7238    U8 low = UTF8_TWO_BYTE_LO(c);
7239
7240    while (hardcount < max
7241      && scan + 1 < loceol
7242      && UCHARAT(scan) == high
7243      && UCHARAT(scan + 1) == low)
7244    {
7245     scan += 2;
7246     hardcount++;
7247    }
7248   }
7249   break;
7250
7251  case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
7252   assert(! reginfo->is_utf8_pat);
7253   /* FALLTHROUGH */
7254  case EXACTFA:
7255   utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7256   goto do_exactf;
7257
7258  case EXACTFL:
7259   utf8_flags = FOLDEQ_LOCALE;
7260   goto do_exactf;
7261
7262  case EXACTF:   /* This node only generated for non-utf8 patterns */
7263   assert(! reginfo->is_utf8_pat);
7264   utf8_flags = 0;
7265   goto do_exactf;
7266
7267  case EXACTFU_SS:
7268  case EXACTFU:
7269   utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
7270
7271  do_exactf: {
7272   int c1, c2;
7273   U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
7274
7275   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7276
7277   if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
7278           reginfo))
7279   {
7280    if (c1 == CHRTEST_VOID) {
7281     /* Use full Unicode fold matching */
7282     char *tmpeol = reginfo->strend;
7283     STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
7284     while (hardcount < max
7285       && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
7286            STRING(p), NULL, pat_len,
7287            reginfo->is_utf8_pat, utf8_flags))
7288     {
7289      scan = tmpeol;
7290      tmpeol = reginfo->strend;
7291      hardcount++;
7292     }
7293    }
7294    else if (utf8_target) {
7295     if (c1 == c2) {
7296      while (scan < loceol
7297       && hardcount < max
7298       && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
7299      {
7300       scan += UTF8SKIP(scan);
7301       hardcount++;
7302      }
7303     }
7304     else {
7305      while (scan < loceol
7306       && hardcount < max
7307       && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
7308        || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
7309      {
7310       scan += UTF8SKIP(scan);
7311       hardcount++;
7312      }
7313     }
7314    }
7315    else if (c1 == c2) {
7316     while (scan < loceol && UCHARAT(scan) == c1) {
7317      scan++;
7318     }
7319    }
7320    else {
7321     while (scan < loceol &&
7322      (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
7323     {
7324      scan++;
7325     }
7326    }
7327   }
7328   break;
7329  }
7330  case ANYOF:
7331   if (utf8_target) {
7332    while (hardcount < max
7333     && scan < loceol
7334     && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
7335    {
7336     scan += UTF8SKIP(scan);
7337     hardcount++;
7338    }
7339   } else {
7340    while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
7341     scan++;
7342   }
7343   break;
7344
7345  /* The argument (FLAGS) to all the POSIX node types is the class number */
7346
7347  case NPOSIXL:
7348   to_complement = 1;
7349   /* FALLTHROUGH */
7350
7351  case POSIXL:
7352   if (! utf8_target) {
7353    while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7354                 *scan)))
7355    {
7356     scan++;
7357    }
7358   } else {
7359    while (hardcount < max && scan < loceol
7360     && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7361                 (U8 *) scan)))
7362    {
7363     scan += UTF8SKIP(scan);
7364     hardcount++;
7365    }
7366   }
7367   break;
7368
7369  case POSIXD:
7370   if (utf8_target) {
7371    goto utf8_posix;
7372   }
7373   /* FALLTHROUGH */
7374
7375  case POSIXA:
7376   if (utf8_target && loceol - scan > max) {
7377
7378    /* We didn't adjust <loceol> at the beginning of this routine
7379    * because is UTF-8, but it is actually ok to do so, since here, to
7380    * match, 1 char == 1 byte. */
7381    loceol = scan + max;
7382   }
7383   while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
7384    scan++;
7385   }
7386   break;
7387
7388  case NPOSIXD:
7389   if (utf8_target) {
7390    to_complement = 1;
7391    goto utf8_posix;
7392   }
7393   /* FALLTHROUGH */
7394
7395  case NPOSIXA:
7396   if (! utf8_target) {
7397    while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
7398     scan++;
7399    }
7400   }
7401   else {
7402
7403    /* The complement of something that matches only ASCII matches all
7404    * non-ASCII, plus everything in ASCII that isn't in the class. */
7405    while (hardcount < max && scan < loceol
7406     && (! isASCII_utf8(scan)
7407      || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7408    {
7409     scan += UTF8SKIP(scan);
7410     hardcount++;
7411    }
7412   }
7413   break;
7414
7415  case NPOSIXU:
7416   to_complement = 1;
7417   /* FALLTHROUGH */
7418
7419  case POSIXU:
7420   if (! utf8_target) {
7421    while (scan < loceol && to_complement
7422         ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7423    {
7424     scan++;
7425    }
7426   }
7427   else {
7428  utf8_posix:
7429    classnum = (_char_class_number) FLAGS(p);
7430    if (classnum < _FIRST_NON_SWASH_CC) {
7431
7432     /* Here, a swash is needed for above-Latin1 code points.
7433     * Process as many Latin1 code points using the built-in rules.
7434     * Go to another loop to finish processing upon encountering
7435     * the first Latin1 code point.  We could do that in this loop
7436     * as well, but the other way saves having to test if the swash
7437     * has been loaded every time through the loop: extra space to
7438     * save a test. */
7439     while (hardcount < max && scan < loceol) {
7440      if (UTF8_IS_INVARIANT(*scan)) {
7441       if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7442                 classnum))))
7443       {
7444        break;
7445       }
7446       scan++;
7447      }
7448      else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7449       if (! (to_complement
7450        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7451                  *(scan + 1)),
7452              classnum))))
7453       {
7454        break;
7455       }
7456       scan += 2;
7457      }
7458      else {
7459       goto found_above_latin1;
7460      }
7461
7462      hardcount++;
7463     }
7464    }
7465    else {
7466     /* For these character classes, the knowledge of how to handle
7467     * every code point is compiled in to Perl via a macro.  This
7468     * code is written for making the loops as tight as possible.
7469     * It could be refactored to save space instead */
7470     switch (classnum) {
7471      case _CC_ENUM_SPACE:    /* XXX would require separate code
7472            if we revert the change of \v
7473            matching this */
7474       /* FALLTHROUGH */
7475      case _CC_ENUM_PSXSPC:
7476       while (hardcount < max
7477        && scan < loceol
7478        && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7479       {
7480        scan += UTF8SKIP(scan);
7481        hardcount++;
7482       }
7483       break;
7484      case _CC_ENUM_BLANK:
7485       while (hardcount < max
7486        && scan < loceol
7487        && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7488       {
7489        scan += UTF8SKIP(scan);
7490        hardcount++;
7491       }
7492       break;
7493      case _CC_ENUM_XDIGIT:
7494       while (hardcount < max
7495        && scan < loceol
7496        && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7497       {
7498        scan += UTF8SKIP(scan);
7499        hardcount++;
7500       }
7501       break;
7502      case _CC_ENUM_VERTSPACE:
7503       while (hardcount < max
7504        && scan < loceol
7505        && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7506       {
7507        scan += UTF8SKIP(scan);
7508        hardcount++;
7509       }
7510       break;
7511      case _CC_ENUM_CNTRL:
7512       while (hardcount < max
7513        && scan < loceol
7514        && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7515       {
7516        scan += UTF8SKIP(scan);
7517        hardcount++;
7518       }
7519       break;
7520      default:
7521       Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7522     }
7523    }
7524   }
7525   break;
7526
7527  found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7528
7529   /* Load the swash if not already present */
7530   if (! PL_utf8_swash_ptrs[classnum]) {
7531    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7532    PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7533           "utf8",
7534           "",
7535           &PL_sv_undef, 1, 0,
7536           PL_XPosix_ptrs[classnum], &flags);
7537   }
7538
7539   while (hardcount < max && scan < loceol
7540    && to_complement ^ cBOOL(_generic_utf8(
7541          classnum,
7542          scan,
7543          swash_fetch(PL_utf8_swash_ptrs[classnum],
7544             (U8 *) scan,
7545             TRUE))))
7546   {
7547    scan += UTF8SKIP(scan);
7548    hardcount++;
7549   }
7550   break;
7551
7552  case LNBREAK:
7553   if (utf8_target) {
7554    while (hardcount < max && scan < loceol &&
7555      (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7556     scan += c;
7557     hardcount++;
7558    }
7559   } else {
7560    /* LNBREAK can match one or two latin chars, which is ok, but we
7561    * have to use hardcount in this situation, and throw away the
7562    * adjustment to <loceol> done before the switch statement */
7563    loceol = reginfo->strend;
7564    while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7565     scan+=c;
7566     hardcount++;
7567    }
7568   }
7569   break;
7570
7571  case BOUND:
7572  case BOUNDA:
7573  case BOUNDL:
7574  case BOUNDU:
7575  case EOS:
7576  case GPOS:
7577  case KEEPS:
7578  case NBOUND:
7579  case NBOUNDA:
7580  case NBOUNDL:
7581  case NBOUNDU:
7582  case OPFAIL:
7583  case SBOL:
7584  case SEOL:
7585   /* These are all 0 width, so match right here or not at all. */
7586   break;
7587
7588  default:
7589   Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7590   /* NOTREACHED */
7591   assert(0);
7592
7593  }
7594
7595  if (hardcount)
7596   c = hardcount;
7597  else
7598   c = scan - *startposp;
7599  *startposp = scan;
7600
7601  DEBUG_r({
7602   GET_RE_DEBUG_FLAGS_DECL;
7603   DEBUG_EXECUTE_r({
7604    SV * const prop = sv_newmortal();
7605    regprop(prog, prop, p, reginfo);
7606    PerlIO_printf(Perl_debug_log,
7607       "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7608       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7609   });
7610  });
7611
7612  return(c);
7613 }
7614
7615
7616 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7617 /*
7618 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7619 create a copy so that changes the caller makes won't change the shared one.
7620 If <altsvp> is non-null, will return NULL in it, for back-compat.
7621  */
7622 SV *
7623 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7624 {
7625  PERL_ARGS_ASSERT_REGCLASS_SWASH;
7626
7627  if (altsvp) {
7628   *altsvp = NULL;
7629  }
7630
7631  return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
7632 }
7633
7634 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
7635
7636 /*
7637  - reginclass - determine if a character falls into a character class
7638
7639   n is the ANYOF regnode
7640   p is the target string
7641   p_end points to one byte beyond the end of the target string
7642   utf8_target tells whether p is in UTF-8.
7643
7644   Returns true if matched; false otherwise.
7645
7646   Note that this can be a synthetic start class, a combination of various
7647   nodes, so things you think might be mutually exclusive, such as locale,
7648   aren't.  It can match both locale and non-locale
7649
7650  */
7651
7652 STATIC bool
7653 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
7654 {
7655  dVAR;
7656  const char flags = ANYOF_FLAGS(n);
7657  bool match = FALSE;
7658  UV c = *p;
7659
7660  PERL_ARGS_ASSERT_REGINCLASS;
7661
7662  /* If c is not already the code point, get it.  Note that
7663  * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7664  if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7665   STRLEN c_len = 0;
7666   c = utf8n_to_uvchr(p, p_end - p, &c_len,
7667     (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7668     | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7669     /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7670     * UTF8_ALLOW_FFFF */
7671   if (c_len == (STRLEN)-1)
7672    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7673  }
7674
7675  /* If this character is potentially in the bitmap, check it */
7676  if (c < NUM_ANYOF_CODE_POINTS) {
7677   if (ANYOF_BITMAP_TEST(n, c))
7678    match = TRUE;
7679   else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
7680     && ! utf8_target
7681     && ! isASCII(c))
7682   {
7683    match = TRUE;
7684   }
7685   else if (flags & ANYOF_LOCALE_FLAGS) {
7686    if ((flags & ANYOF_LOC_FOLD)
7687     && c < 256
7688     && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7689    {
7690     match = TRUE;
7691    }
7692    else if (ANYOF_POSIXL_TEST_ANY_SET(n)
7693      && c < 256
7694    ) {
7695
7696     /* The data structure is arranged so bits 0, 2, 4, ... are set
7697     * if the class includes the Posix character class given by
7698     * bit/2; and 1, 3, 5, ... are set if the class includes the
7699     * complemented Posix class given by int(bit/2).  So we loop
7700     * through the bits, each time changing whether we complement
7701     * the result or not.  Suppose for the sake of illustration
7702     * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7703     * is set, it means there is a match for this ANYOF node if the
7704     * character is in the class given by the expression (0 / 2 = 0
7705     * = \w).  If it is in that class, isFOO_lc() will return 1,
7706     * and since 'to_complement' is 0, the result will stay TRUE,
7707     * and we exit the loop.  Suppose instead that bit 0 is 0, but
7708     * bit 1 is 1.  That means there is a match if the character
7709     * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7710     * but will on bit 1.  On the second iteration 'to_complement'
7711     * will be 1, so the exclusive or will reverse things, so we
7712     * are testing for \W.  On the third iteration, 'to_complement'
7713     * will be 0, and we would be testing for \s; the fourth
7714     * iteration would test for \S, etc.
7715     *
7716     * Note that this code assumes that all the classes are closed
7717     * under folding.  For example, if a character matches \w, then
7718     * its fold does too; and vice versa.  This should be true for
7719     * any well-behaved locale for all the currently defined Posix
7720     * classes, except for :lower: and :upper:, which are handled
7721     * by the pseudo-class :cased: which matches if either of the
7722     * other two does.  To get rid of this assumption, an outer
7723     * loop could be used below to iterate over both the source
7724     * character, and its fold (if different) */
7725
7726     int count = 0;
7727     int to_complement = 0;
7728
7729     while (count < ANYOF_MAX) {
7730      if (ANYOF_POSIXL_TEST(n, count)
7731       && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7732      {
7733       match = TRUE;
7734       break;
7735      }
7736      count++;
7737      to_complement ^= 1;
7738     }
7739    }
7740   }
7741  }
7742
7743
7744  /* If the bitmap didn't (or couldn't) match, and something outside the
7745  * bitmap could match, try that. */
7746  if (!match) {
7747   if (c >= NUM_ANYOF_CODE_POINTS
7748    && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
7749   {
7750    match = TRUE; /* Everything above the bitmap matches */
7751   }
7752   else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
7753     || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
7754     || ((flags & ANYOF_LOC_FOLD)
7755      && IN_UTF8_CTYPE_LOCALE
7756      && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
7757   {
7758    SV* only_utf8_locale = NULL;
7759    SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
7760              &only_utf8_locale, NULL);
7761    if (sw) {
7762     U8 utf8_buffer[2];
7763     U8 * utf8_p;
7764     if (utf8_target) {
7765      utf8_p = (U8 *) p;
7766     } else { /* Convert to utf8 */
7767      utf8_p = utf8_buffer;
7768      append_utf8_from_native_byte(*p, &utf8_p);
7769      utf8_p = utf8_buffer;
7770     }
7771
7772     if (swash_fetch(sw, utf8_p, TRUE)) {
7773      match = TRUE;
7774     }
7775    }
7776    if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
7777     match = _invlist_contains_cp(only_utf8_locale, c);
7778    }
7779   }
7780
7781   if (UNICODE_IS_SUPER(c)
7782    && (flags & ANYOF_WARN_SUPER)
7783    && ckWARN_d(WARN_NON_UNICODE))
7784   {
7785    Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7786     "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
7787   }
7788  }
7789
7790 #if ANYOF_INVERT != 1
7791  /* Depending on compiler optimization cBOOL takes time, so if don't have to
7792  * use it, don't */
7793 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
7794 #endif
7795
7796  /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7797  return (flags & ANYOF_INVERT) ^ match;
7798 }
7799
7800 STATIC U8 *
7801 S_reghop3(U8 *s, SSize_t off, const U8* lim)
7802 {
7803  /* return the position 'off' UTF-8 characters away from 's', forward if
7804  * 'off' >= 0, backwards if negative.  But don't go outside of position
7805  * 'lim', which better be < s  if off < 0 */
7806
7807  PERL_ARGS_ASSERT_REGHOP3;
7808
7809  if (off >= 0) {
7810   while (off-- && s < lim) {
7811    /* XXX could check well-formedness here */
7812    s += UTF8SKIP(s);
7813   }
7814  }
7815  else {
7816   while (off++ && s > lim) {
7817    s--;
7818    if (UTF8_IS_CONTINUED(*s)) {
7819     while (s > lim && UTF8_IS_CONTINUATION(*s))
7820      s--;
7821    }
7822    /* XXX could check well-formedness here */
7823   }
7824  }
7825  return s;
7826 }
7827
7828 STATIC U8 *
7829 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
7830 {
7831  PERL_ARGS_ASSERT_REGHOP4;
7832
7833  if (off >= 0) {
7834   while (off-- && s < rlim) {
7835    /* XXX could check well-formedness here */
7836    s += UTF8SKIP(s);
7837   }
7838  }
7839  else {
7840   while (off++ && s > llim) {
7841    s--;
7842    if (UTF8_IS_CONTINUED(*s)) {
7843     while (s > llim && UTF8_IS_CONTINUATION(*s))
7844      s--;
7845    }
7846    /* XXX could check well-formedness here */
7847   }
7848  }
7849  return s;
7850 }
7851
7852 /* like reghop3, but returns NULL on overrun, rather than returning last
7853  * char pos */
7854
7855 STATIC U8 *
7856 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
7857 {
7858  PERL_ARGS_ASSERT_REGHOPMAYBE3;
7859
7860  if (off >= 0) {
7861   while (off-- && s < lim) {
7862    /* XXX could check well-formedness here */
7863    s += UTF8SKIP(s);
7864   }
7865   if (off >= 0)
7866    return NULL;
7867  }
7868  else {
7869   while (off++ && s > lim) {
7870    s--;
7871    if (UTF8_IS_CONTINUED(*s)) {
7872     while (s > lim && UTF8_IS_CONTINUATION(*s))
7873      s--;
7874    }
7875    /* XXX could check well-formedness here */
7876   }
7877   if (off <= 0)
7878    return NULL;
7879  }
7880  return s;
7881 }
7882
7883
7884 /* when executing a regex that may have (?{}), extra stuff needs setting
7885    up that will be visible to the called code, even before the current
7886    match has finished. In particular:
7887
7888    * $_ is localised to the SV currently being matched;
7889    * pos($_) is created if necessary, ready to be updated on each call-out
7890  to code;
7891    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7892  isn't set until the current pattern is successfully finished), so that
7893  $1 etc of the match-so-far can be seen;
7894    * save the old values of subbeg etc of the current regex, and  set then
7895  to the current string (again, this is normally only done at the end
7896  of execution)
7897 */
7898
7899 static void
7900 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7901 {
7902  MAGIC *mg;
7903  regexp *const rex = ReANY(reginfo->prog);
7904  regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7905
7906  eval_state->rex = rex;
7907
7908  if (reginfo->sv) {
7909   /* Make $_ available to executed code. */
7910   if (reginfo->sv != DEFSV) {
7911    SAVE_DEFSV;
7912    DEFSV_set(reginfo->sv);
7913   }
7914
7915   if (!(mg = mg_find_mglob(reginfo->sv))) {
7916    /* prepare for quick setting of pos */
7917    mg = sv_magicext_mglob(reginfo->sv);
7918    mg->mg_len = -1;
7919   }
7920   eval_state->pos_magic = mg;
7921   eval_state->pos       = mg->mg_len;
7922   eval_state->pos_flags = mg->mg_flags;
7923  }
7924  else
7925   eval_state->pos_magic = NULL;
7926
7927  if (!PL_reg_curpm) {
7928   /* PL_reg_curpm is a fake PMOP that we can attach the current
7929   * regex to and point PL_curpm at, so that $1 et al are visible
7930   * within a /(?{})/. It's just allocated once per interpreter the
7931   * first time its needed */
7932   Newxz(PL_reg_curpm, 1, PMOP);
7933 #ifdef USE_ITHREADS
7934   {
7935    SV* const repointer = &PL_sv_undef;
7936    /* this regexp is also owned by the new PL_reg_curpm, which
7937    will try to free it.  */
7938    av_push(PL_regex_padav, repointer);
7939    PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
7940    PL_regex_pad = AvARRAY(PL_regex_padav);
7941   }
7942 #endif
7943  }
7944  SET_reg_curpm(reginfo->prog);
7945  eval_state->curpm = PL_curpm;
7946  PL_curpm = PL_reg_curpm;
7947  if (RXp_MATCH_COPIED(rex)) {
7948   /*  Here is a serious problem: we cannot rewrite subbeg,
7949    since it may be needed if this match fails.  Thus
7950    $` inside (?{}) could fail... */
7951   eval_state->subbeg     = rex->subbeg;
7952   eval_state->sublen     = rex->sublen;
7953   eval_state->suboffset  = rex->suboffset;
7954   eval_state->subcoffset = rex->subcoffset;
7955 #ifdef PERL_ANY_COW
7956   eval_state->saved_copy = rex->saved_copy;
7957 #endif
7958   RXp_MATCH_COPIED_off(rex);
7959  }
7960  else
7961   eval_state->subbeg = NULL;
7962  rex->subbeg = (char *)reginfo->strbeg;
7963  rex->suboffset = 0;
7964  rex->subcoffset = 0;
7965  rex->sublen = reginfo->strend - reginfo->strbeg;
7966 }
7967
7968
7969 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
7970
7971 static void
7972 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
7973 {
7974  regmatch_info_aux *aux = (regmatch_info_aux *) arg;
7975  regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
7976  regmatch_slab *s;
7977
7978  Safefree(aux->poscache);
7979
7980  if (eval_state) {
7981
7982   /* undo the effects of S_setup_eval_state() */
7983
7984   if (eval_state->subbeg) {
7985    regexp * const rex = eval_state->rex;
7986    rex->subbeg     = eval_state->subbeg;
7987    rex->sublen     = eval_state->sublen;
7988    rex->suboffset  = eval_state->suboffset;
7989    rex->subcoffset = eval_state->subcoffset;
7990 #ifdef PERL_ANY_COW
7991    rex->saved_copy = eval_state->saved_copy;
7992 #endif
7993    RXp_MATCH_COPIED_on(rex);
7994   }
7995   if (eval_state->pos_magic)
7996   {
7997    eval_state->pos_magic->mg_len = eval_state->pos;
7998    eval_state->pos_magic->mg_flags =
7999     (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8000    | (eval_state->pos_flags & MGf_BYTES);
8001   }
8002
8003   PL_curpm = eval_state->curpm;
8004  }
8005
8006  PL_regmatch_state = aux->old_regmatch_state;
8007  PL_regmatch_slab  = aux->old_regmatch_slab;
8008
8009  /* free all slabs above current one - this must be the last action
8010  * of this function, as aux and eval_state are allocated within
8011  * slabs and may be freed here */
8012
8013  s = PL_regmatch_slab->next;
8014  if (s) {
8015   PL_regmatch_slab->next = NULL;
8016   while (s) {
8017    regmatch_slab * const osl = s;
8018    s = s->next;
8019    Safefree(osl);
8020   }
8021  }
8022 }
8023
8024
8025 STATIC void
8026 S_to_utf8_substr(pTHX_ regexp *prog)
8027 {
8028  /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8029  * on the converted value */
8030
8031  int i = 1;
8032
8033  PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8034
8035  do {
8036   if (prog->substrs->data[i].substr
8037    && !prog->substrs->data[i].utf8_substr) {
8038    SV* const sv = newSVsv(prog->substrs->data[i].substr);
8039    prog->substrs->data[i].utf8_substr = sv;
8040    sv_utf8_upgrade(sv);
8041    if (SvVALID(prog->substrs->data[i].substr)) {
8042     if (SvTAIL(prog->substrs->data[i].substr)) {
8043      /* Trim the trailing \n that fbm_compile added last
8044      time.  */
8045      SvCUR_set(sv, SvCUR(sv) - 1);
8046      /* Whilst this makes the SV technically "invalid" (as its
8047      buffer is no longer followed by "\0") when fbm_compile()
8048      adds the "\n" back, a "\0" is restored.  */
8049      fbm_compile(sv, FBMcf_TAIL);
8050     } else
8051      fbm_compile(sv, 0);
8052    }
8053    if (prog->substrs->data[i].substr == prog->check_substr)
8054     prog->check_utf8 = sv;
8055   }
8056  } while (i--);
8057 }
8058
8059 STATIC bool
8060 S_to_byte_substr(pTHX_ regexp *prog)
8061 {
8062  /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8063  * on the converted value; returns FALSE if can't be converted. */
8064
8065  int i = 1;
8066
8067  PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8068
8069  do {
8070   if (prog->substrs->data[i].utf8_substr
8071    && !prog->substrs->data[i].substr) {
8072    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
8073    if (! sv_utf8_downgrade(sv, TRUE)) {
8074     return FALSE;
8075    }
8076    if (SvVALID(prog->substrs->data[i].utf8_substr)) {
8077     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
8078      /* Trim the trailing \n that fbm_compile added last
8079       time.  */
8080      SvCUR_set(sv, SvCUR(sv) - 1);
8081      fbm_compile(sv, FBMcf_TAIL);
8082     } else
8083      fbm_compile(sv, 0);
8084    }
8085    prog->substrs->data[i].substr = sv;
8086    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
8087     prog->check_substr = sv;
8088   }
8089  } while (i--);
8090
8091  return TRUE;
8092 }
8093
8094 /*
8095  * Local variables:
8096  * c-indentation-style: bsd
8097  * c-basic-offset: 4
8098  * indent-tabs-mode: nil
8099  * End:
8100  *
8101  * ex: set ts=8 sts=4 sw=4 et:
8102  */