src/5021003/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *  One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  * Copyright (c) 1986 by University of Toronto.
  44  * Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  * Permission is granted to anyone to use this software for any
  47  * purpose on any computer system, and to redistribute it freely,
  48  * subject to the following restrictions:
  49  *
  50  * 1. The author is not responsible for the consequences of use of
  51  *  this software, no matter how awful, even if they arise
  52  *  from defects in it.
  53  *
  54  * 2. The origin of this software must not be misrepresented, either
  55  *  by explicit claim or by omission.
  56  *
  57  * 3. Altered versions must be plainly marked as such, and must not
  58  *  be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #include "perl.h"
  76 #include "re_defs.h"
  77
  78 #ifdef PERL_IN_XSUB_RE
  79 #  include "re_comp.h"
  80 #else
  81 #  include "regcomp.h"
  82 #endif
  83
  84 #include "inline_invlist.c"
  85 #include "unicode_constants.h"
  86
  87 #ifdef DEBUGGING
  88 /* At least one required character in the target string is expressible only in
  89  * UTF-8. */
  90 static const char* const non_utf8_target_but_utf8_required
  91     = "Can't match, because target string needs to be in UTF-8\n";
  92 #endif
  93
  94 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  95  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  96  goto target; \
  97 } STMT_END
  98
  99 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 100
 101 #ifndef STATIC
 102 #define STATIC static
 103 #endif
 104
 105 /* Valid only for non-utf8 strings: avoids the reginclass
 106  * call if there are no complications: i.e., if everything matchable is
 107  * straight forward in the bitmap */
 108 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0)   \
 109            : ANYOF_BITMAP_TEST(p,*(c)))
 110
 111 /*
 112  * Forwards.
 113  */
 114
 115 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 116 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 117
 118 #define HOPc(pos,off) \
 119   (char *)(reginfo->is_utf8_target \
 120    ? reghop3((U8*)pos, off, \
 121      (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 122    : (U8*)(pos + off))
 123
 124 #define HOPBACKc(pos, off) \
 125   (char*)(reginfo->is_utf8_target \
 126    ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 127    : (pos - off >= reginfo->strbeg) \
 128     ? (U8*)pos - off  \
 129     : NULL)
 130
 131 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 132 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 133
 134 /* lim must be +ve. Returns NULL on overshoot */
 135 #define HOPMAYBE3(pos,off,lim) \
 136   (reginfo->is_utf8_target                        \
 137    ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 138    : ((U8*)pos + off <= lim)                   \
 139     ? (U8*)pos + off                        \
 140     : NULL)
 141
 142 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 143  * off must be >=0; args should be vars rather than expressions */
 144 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 145  ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 146  : (U8*)((pos + off) > lim ? lim : (pos + off)))
 147
 148 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 149  ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 150  : (U8*)(pos + off))
 151 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 152
 153 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 154 #define NEXTCHR_IS_EOS (nextchr < 0)
 155
 156 #define SET_nextchr \
 157  nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 158
 159 #define SET_locinput(p) \
 160  locinput = (p);  \
 161  SET_nextchr
 162
 163
 164 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 165   if (!swash_ptr) {                                                     \
 166    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 167    swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 168           1, 0, invlist, &flags);              \
 169    assert(swash_ptr);                                                \
 170   }                                                                     \
 171  } STMT_END
 172
 173 /* If in debug mode, we test that a known character properly matches */
 174 #ifdef DEBUGGING
 175 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 176           property_name,                      \
 177           invlist,                            \
 178           utf8_char_in_property)              \
 179   LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 180   assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 181 #else
 182 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 183           property_name,                      \
 184           invlist,                            \
 185           utf8_char_in_property)              \
 186   LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 187 #endif
 188
 189 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 190           PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 191           "",                                   \
 192           PL_XPosix_ptrs[_CC_WORDCHAR],         \
 193           LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
 194
 195 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 196  STMT_START {                                                              \
 197   LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 198          "_X_regular_begin",                    \
 199          NULL,                                  \
 200          LATIN_CAPITAL_LETTER_SHARP_S_UTF8);    \
 201   LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 202          "_X_extend",                           \
 203          NULL,                                  \
 204          COMBINING_GRAVE_ACCENT_UTF8);          \
 205  } STMT_END
 206
 207 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
 208 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 209
 210 /* for use after a quantifier and before an EXACT-like node -- japhy */
 211 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 212  *
 213  * NOTE that *nothing* that affects backtracking should be in here, specifically
 214  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 215  * node that is in between two EXACT like nodes when ascertaining what the required
 216  * "follow" character is. This should probably be moved to regex compile time
 217  * although it may be done at run time beause of the REF possibility - more
 218  * investigation required. -- demerphq
 219 */
 220 #define JUMPABLE(rn) (                                                             \
 221  OP(rn) == OPEN ||                                                              \
 222  (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 223  OP(rn) == EVAL ||                                                              \
 224  OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 225  OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 226  OP(rn) == KEEPS ||                                                             \
 227  (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 228 )
 229 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 230
 231 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 232
 233 #if 0
 234 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 235    we don't need this definition. */
 236 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 237 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 238 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 239
 240 #else
 241 /* ... so we use this as its faster. */
 242 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 243 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 244 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 245 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 246
 247 #endif
 248
 249 /*
 250   Search for mandatory following text node; for lookahead, the text must
 251   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 252 */
 253 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 254  while (JUMPABLE(rn)) { \
 255   const OPCODE type = OP(rn); \
 256   if (type == SUSPEND || PL_regkind[type] == CURLY) \
 257    rn = NEXTOPER(NEXTOPER(rn)); \
 258   else if (type == PLUS) \
 259    rn = NEXTOPER(rn); \
 260   else if (type == IFMATCH) \
 261    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 262   else rn += NEXT_OFF(rn); \
 263  } \
 264 } STMT_END
 265
 266 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 267  * These are for the pre-composed Hangul syllables, which are all in a
 268  * contiguous block and arranged there in such a way so as to facilitate
 269  * alorithmic determination of their characteristics.  As such, they don't need
 270  * a swash, but can be determined by simple arithmetic.  Almost all are
 271  * GCB=LVT, but every 28th one is a GCB=LV */
 272 #define SBASE 0xAC00    /* Start of block */
 273 #define SCount 11172    /* Length of block */
 274 #define TCount 28
 275
 276 #define SLAB_FIRST(s) (&(s)->states[0])
 277 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 278
 279 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 280 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 281 static regmatch_state * S_push_slab(pTHX);
 282
 283 #define REGCP_PAREN_ELEMS 3
 284 #define REGCP_OTHER_ELEMS 3
 285 #define REGCP_FRAME_ELEMS 1
 286 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 287  * are needed for the regexp context stack bookkeeping. */
 288
 289 STATIC CHECKPOINT
 290 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 291 {
 292  const int retval = PL_savestack_ix;
 293  const int paren_elems_to_push =
 294     (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 295  const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 296  const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 297  I32 p;
 298  GET_RE_DEBUG_FLAGS_DECL;
 299
 300  PERL_ARGS_ASSERT_REGCPPUSH;
 301
 302  if (paren_elems_to_push < 0)
 303   Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 304     (int)paren_elems_to_push, (int)maxopenparen,
 305     (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 306
 307  if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 308   Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 309     " out of range (%lu-%ld)",
 310     total_elems,
 311     (unsigned long)maxopenparen,
 312     (long)parenfloor);
 313
 314  SSGROW(total_elems + REGCP_FRAME_ELEMS);
 315
 316  DEBUG_BUFFERS_r(
 317   if ((int)maxopenparen > (int)parenfloor)
 318    PerlIO_printf(Perl_debug_log,
 319     "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 320     PTR2UV(rex),
 321     PTR2UV(rex->offs)
 322    );
 323  );
 324  for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 325 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 326   SSPUSHIV(rex->offs[p].end);
 327   SSPUSHIV(rex->offs[p].start);
 328   SSPUSHINT(rex->offs[p].start_tmp);
 329   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 330    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 331    (UV)p,
 332    (IV)rex->offs[p].start,
 333    (IV)rex->offs[p].start_tmp,
 334    (IV)rex->offs[p].end
 335   ));
 336  }
 337 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 338  SSPUSHINT(maxopenparen);
 339  SSPUSHINT(rex->lastparen);
 340  SSPUSHINT(rex->lastcloseparen);
 341  SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 342
 343  return retval;
 344 }
 345
 346 /* These are needed since we do not localize EVAL nodes: */
 347 #define REGCP_SET(cp)                                           \
 348  DEBUG_STATE_r(                                              \
 349    PerlIO_printf(Perl_debug_log,          \
 350     "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 351     (IV)PL_savestack_ix));                          \
 352  cp = PL_savestack_ix
 353
 354 #define REGCP_UNWIND(cp)                                        \
 355  DEBUG_STATE_r(                                              \
 356   if (cp != PL_savestack_ix)                   \
 357     PerlIO_printf(Perl_debug_log,          \
 358     "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 359     (IV)(cp), (IV)PL_savestack_ix));                \
 360  regcpblow(cp)
 361
 362 #define UNWIND_PAREN(lp, lcp)               \
 363  for (n = rex->lastparen; n > lp; n--)   \
 364   rex->offs[n].end = -1;              \
 365  rex->lastparen = n;                     \
 366  rex->lastcloseparen = lcp;
 367
 368
 369 STATIC void
 370 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 371 {
 372  UV i;
 373  U32 paren;
 374  GET_RE_DEBUG_FLAGS_DECL;
 375
 376  PERL_ARGS_ASSERT_REGCPPOP;
 377
 378  /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 379  i = SSPOPUV;
 380  assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 381  i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 382  rex->lastcloseparen = SSPOPINT;
 383  rex->lastparen = SSPOPINT;
 384  *maxopenparen_p = SSPOPINT;
 385
 386  i -= REGCP_OTHER_ELEMS;
 387  /* Now restore the parentheses context. */
 388  DEBUG_BUFFERS_r(
 389   if (i || rex->lastparen + 1 <= rex->nparens)
 390    PerlIO_printf(Perl_debug_log,
 391     "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 392     PTR2UV(rex),
 393     PTR2UV(rex->offs)
 394    );
 395  );
 396  paren = *maxopenparen_p;
 397  for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 398   SSize_t tmps;
 399   rex->offs[paren].start_tmp = SSPOPINT;
 400   rex->offs[paren].start = SSPOPIV;
 401   tmps = SSPOPIV;
 402   if (paren <= rex->lastparen)
 403    rex->offs[paren].end = tmps;
 404   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 405    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 406    (UV)paren,
 407    (IV)rex->offs[paren].start,
 408    (IV)rex->offs[paren].start_tmp,
 409    (IV)rex->offs[paren].end,
 410    (paren > rex->lastparen ? "(skipped)" : ""));
 411   );
 412   paren--;
 413  }
 414 #if 1
 415  /* It would seem that the similar code in regtry()
 416  * already takes care of this, and in fact it is in
 417  * a better location to since this code can #if 0-ed out
 418  * but the code in regtry() is needed or otherwise tests
 419  * requiring null fields (pat.t#187 and split.t#{13,14}
 420  * (as of patchlevel 7877)  will fail.  Then again,
 421  * this code seems to be necessary or otherwise
 422  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 423  * --jhi updated by dapm */
 424  for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 425   if (i > *maxopenparen_p)
 426    rex->offs[i].start = -1;
 427   rex->offs[i].end = -1;
 428   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 429    "    \\%"UVuf": %s   ..-1 undeffing\n",
 430    (UV)i,
 431    (i > *maxopenparen_p) ? "-1" : "  "
 432   ));
 433  }
 434 #endif
 435 }
 436
 437 /* restore the parens and associated vars at savestack position ix,
 438  * but without popping the stack */
 439
 440 STATIC void
 441 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 442 {
 443  I32 tmpix = PL_savestack_ix;
 444  PL_savestack_ix = ix;
 445  regcppop(rex, maxopenparen_p);
 446  PL_savestack_ix = tmpix;
 447 }
 448
 449 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
 450
 451 STATIC bool
 452 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 453 {
 454  /* Returns a boolean as to whether or not 'character' is a member of the
 455  * Posix character class given by 'classnum' that should be equivalent to a
 456  * value in the typedef '_char_class_number'.
 457  *
 458  * Ideally this could be replaced by a just an array of function pointers
 459  * to the C library functions that implement the macros this calls.
 460  * However, to compile, the precise function signatures are required, and
 461  * these may vary from platform to to platform.  To avoid having to figure
 462  * out what those all are on each platform, I (khw) am using this method,
 463  * which adds an extra layer of function call overhead (unless the C
 464  * optimizer strips it away).  But we don't particularly care about
 465  * performance with locales anyway. */
 466
 467  switch ((_char_class_number) classnum) {
 468   case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 469   case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 470   case _CC_ENUM_ASCII:     return isASCII_LC(character);
 471   case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 472   case _CC_ENUM_CASED:     return isLOWER_LC(character)
 473           || isUPPER_LC(character);
 474   case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 475   case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 476   case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 477   case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 478   case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 479   case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 480   case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 481   case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 482   case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 483   case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 484   case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 485   default:    /* VERTSPACE should never occur in locales */
 486    Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 487  }
 488
 489  assert(0); /* NOTREACHED */
 490  return FALSE;
 491 }
 492
 493 STATIC bool
 494 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 495 {
 496  /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 497  * 'character' is a member of the Posix character class given by 'classnum'
 498  * that should be equivalent to a value in the typedef
 499  * '_char_class_number'.
 500  *
 501  * This just calls isFOO_lc on the code point for the character if it is in
 502  * the range 0-255.  Outside that range, all characters avoid Unicode
 503  * rules, ignoring any locale.  So use the Unicode function if this class
 504  * requires a swash, and use the Unicode macro otherwise. */
 505
 506  PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 507
 508  if (UTF8_IS_INVARIANT(*character)) {
 509   return isFOO_lc(classnum, *character);
 510  }
 511  else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 512   return isFOO_lc(classnum,
 513       TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
 514  }
 515
 516  if (classnum < _FIRST_NON_SWASH_CC) {
 517
 518   /* Initialize the swash unless done already */
 519   if (! PL_utf8_swash_ptrs[classnum]) {
 520    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 521    PL_utf8_swash_ptrs[classnum] =
 522      _core_swash_init("utf8",
 523          "",
 524          &PL_sv_undef, 1, 0,
 525          PL_XPosix_ptrs[classnum], &flags);
 526   }
 527
 528   return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 529         character,
 530         TRUE /* is UTF */ ));
 531  }
 532
 533  switch ((_char_class_number) classnum) {
 534   case _CC_ENUM_SPACE:
 535   case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 536
 537   case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 538   case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 539   case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 540   default:                 break;
 541  }
 542
 543  return FALSE; /* Things like CNTRL are always below 256 */
 544 }
 545
 546 /*
 547  * pregexec and friends
 548  */
 549
 550 #ifndef PERL_IN_XSUB_RE
 551 /*
 552  - pregexec - match a regexp against a string
 553  */
 554 I32
 555 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 556   char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 557 /* stringarg: the point in the string at which to begin matching */
 558 /* strend:    pointer to null at end of string */
 559 /* strbeg:    real beginning of string */
 560 /* minend:    end of match must be >= minend bytes after stringarg. */
 561 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 562  *            itself is accessed via the pointers above */
 563 /* nosave:    For optimizations. */
 564 {
 565  PERL_ARGS_ASSERT_PREGEXEC;
 566
 567  return
 568   regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 569      nosave ? 0 : REXEC_COPY_STR);
 570 }
 571 #endif
 572
 573
 574
 575 /* re_intuit_start():
 576  *
 577  * Based on some optimiser hints, try to find the earliest position in the
 578  * string where the regex could match.
 579  *
 580  *   rx:     the regex to match against
 581  *   sv:     the SV being matched: only used for utf8 flag; the string
 582  *           itself is accessed via the pointers below. Note that on
 583  *           something like an overloaded SV, SvPOK(sv) may be false
 584  *           and the string pointers may point to something unrelated to
 585  *           the SV itself.
 586  *   strbeg: real beginning of string
 587  *   strpos: the point in the string at which to begin matching
 588  *   strend: pointer to the byte following the last char of the string
 589  *   flags   currently unused; set to 0
 590  *   data:   currently unused; set to NULL
 591  *
 592  * The basic idea of re_intuit_start() is to use some known information
 593  * about the pattern, namely:
 594  *
 595  *   a) the longest known anchored substring (i.e. one that's at a
 596  *      constant offset from the beginning of the pattern; but not
 597  *      necessarily at a fixed offset from the beginning of the
 598  *      string);
 599  *   b) the longest floating substring (i.e. one that's not at a constant
 600  *      offset from the beginning of the pattern);
 601  *   c) Whether the pattern is anchored to the string; either
 602  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 603  *      or anchored to pos(): /\G/;
 604  *   d) A start class: a real or synthetic character class which
 605  *      represents which characters are legal at the start of the pattern;
 606  *
 607  * to either quickly reject the match, or to find the earliest position
 608  * within the string at which the pattern might match, thus avoiding
 609  * running the full NFA engine at those earlier locations, only to
 610  * eventually fail and retry further along.
 611  *
 612  * Returns NULL if the pattern can't match, or returns the address within
 613  * the string which is the earliest place the match could occur.
 614  *
 615  * The longest of the anchored and floating substrings is called 'check'
 616  * and is checked first. The other is called 'other' and is checked
 617  * second. The 'other' substring may not be present.  For example,
 618  *
 619  *    /(abc|xyz)ABC\d{0,3}DEFG/
 620  *
 621  * will have
 622  *
 623  *   check substr (float)    = "DEFG", offset 6..9 chars
 624  *   other substr (anchored) = "ABC",  offset 3..3 chars
 625  *   stclass = [ax]
 626  *
 627  * Be aware that during the course of this function, sometimes 'anchored'
 628  * refers to a substring being anchored relative to the start of the
 629  * pattern, and sometimes to the pattern itself being anchored relative to
 630  * the string. For example:
 631  *
 632  *   /\dabc/:   "abc" is anchored to the pattern;
 633  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 634  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 635  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 636  *                    but the pattern is anchored to the string.
 637  */
 638
 639 char *
 640 Perl_re_intuit_start(pTHX_
 641      REGEXP * const rx,
 642      SV *sv,
 643      const char * const strbeg,
 644      char *strpos,
 645      char *strend,
 646      const U32 flags,
 647      re_scream_pos_data *data)
 648 {
 649  struct regexp *const prog = ReANY(rx);
 650  SSize_t start_shift = prog->check_offset_min;
 651  /* Should be nonnegative! */
 652  SSize_t end_shift   = 0;
 653  /* current lowest pos in string where the regex can start matching */
 654  char *rx_origin = strpos;
 655  SV *check;
 656  const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 657  U8   other_ix = 1 - prog->substrs->check_ix;
 658  bool ml_anch = 0;
 659  char *other_last = strpos;/* latest pos 'other' substr already checked to */
 660  char *check_at = NULL;  /* check substr found at this pos */
 661  const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 662  RXi_GET_DECL(prog,progi);
 663  regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 664  regmatch_info *const reginfo = &reginfo_buf;
 665  GET_RE_DEBUG_FLAGS_DECL;
 666
 667  PERL_ARGS_ASSERT_RE_INTUIT_START;
 668  PERL_UNUSED_ARG(flags);
 669  PERL_UNUSED_ARG(data);
 670
 671  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 672     "Intuit: trying to determine minimum start position...\n"));
 673
 674  /* for now, assume that all substr offsets are positive. If at some point
 675  * in the future someone wants to do clever things with look-behind and
 676  * -ve offsets, they'll need to fix up any code in this function
 677  * which uses these offsets. See the thread beginning
 678  * <20140113145929.GF27210@iabyn.com>
 679  */
 680  assert(prog->substrs->data[0].min_offset >= 0);
 681  assert(prog->substrs->data[0].max_offset >= 0);
 682  assert(prog->substrs->data[1].min_offset >= 0);
 683  assert(prog->substrs->data[1].max_offset >= 0);
 684  assert(prog->substrs->data[2].min_offset >= 0);
 685  assert(prog->substrs->data[2].max_offset >= 0);
 686
 687  /* for now, assume that if both present, that the floating substring
 688  * doesn't start before the anchored substring.
 689  * If you break this assumption (e.g. doing better optimisations
 690  * with lookahead/behind), then you'll need to audit the code in this
 691  * function carefully first
 692  */
 693  assert(
 694    ! (  (prog->anchored_utf8 || prog->anchored_substr)
 695    && (prog->float_utf8    || prog->float_substr))
 696   || (prog->float_min_offset >= prog->anchored_offset));
 697
 698  /* byte rather than char calculation for efficiency. It fails
 699  * to quickly reject some cases that can't match, but will reject
 700  * them later after doing full char arithmetic */
 701  if (prog->minlen > strend - strpos) {
 702   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 703        "  String too short...\n"));
 704   goto fail;
 705  }
 706
 707  reginfo->is_utf8_target = cBOOL(utf8_target);
 708  reginfo->info_aux = NULL;
 709  reginfo->strbeg = strbeg;
 710  reginfo->strend = strend;
 711  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 712  reginfo->intuit = 1;
 713  /* not actually used within intuit, but zero for safety anyway */
 714  reginfo->poscache_maxiter = 0;
 715
 716  if (utf8_target) {
 717   if (!prog->check_utf8 && prog->check_substr)
 718    to_utf8_substr(prog);
 719   check = prog->check_utf8;
 720  } else {
 721   if (!prog->check_substr && prog->check_utf8) {
 722    if (! to_byte_substr(prog)) {
 723     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 724    }
 725   }
 726   check = prog->check_substr;
 727  }
 728
 729  /* dump the various substring data */
 730  DEBUG_OPTIMISE_MORE_r({
 731   int i;
 732   for (i=0; i<=2; i++) {
 733    SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 734         : prog->substrs->data[i].substr);
 735    if (!sv)
 736     continue;
 737
 738    PerlIO_printf(Perl_debug_log,
 739     "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 740     " useful=%"IVdf" utf8=%d [%s]\n",
 741     i,
 742     (IV)prog->substrs->data[i].min_offset,
 743     (IV)prog->substrs->data[i].max_offset,
 744     (IV)prog->substrs->data[i].end_shift,
 745     BmUSEFUL(sv),
 746     utf8_target ? 1 : 0,
 747     SvPEEK(sv));
 748   }
 749  });
 750
 751  if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 752
 753   /* ml_anch: check after \n?
 754   *
 755   * A note about IMPLICIT: on an un-anchored pattern beginning
 756   * with /.*.../, these flags will have been added by the
 757   * compiler:
 758   *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 759   *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 760   */
 761   ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 762     && !(prog->intflags & PREGf_IMPLICIT);
 763
 764   if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 765    /* we are only allowed to match at BOS or \G */
 766
 767    /* trivially reject if there's a BOS anchor and we're not at BOS.
 768    *
 769    * Note that we don't try to do a similar quick reject for
 770    * \G, since generally the caller will have calculated strpos
 771    * based on pos() and gofs, so the string is already correctly
 772    * anchored by definition; and handling the exceptions would
 773    * be too fiddly (e.g. REXEC_IGNOREPOS).
 774    */
 775    if (   strpos != strbeg
 776     && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL)))
 777    {
 778     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 779         "  Not at start...\n"));
 780     goto fail;
 781    }
 782
 783    /* in the presence of an anchor, the anchored (relative to the
 784    * start of the regex) substr must also be anchored relative
 785    * to strpos. So quickly reject if substr isn't found there.
 786    * This works for \G too, because the caller will already have
 787    * subtracted gofs from pos, and gofs is the offset from the
 788    * \G to the start of the regex. For example, in /.abc\Gdef/,
 789    * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 790    * caller will have set strpos=pos()-4; we look for the substr
 791    * at position pos()-4+1, which lines up with the "a" */
 792
 793    if (prog->check_offset_min == prog->check_offset_max
 794     && !(prog->intflags & PREGf_CANY_SEEN))
 795    {
 796     /* Substring at constant offset from beg-of-str... */
 797     SSize_t slen = SvCUR(check);
 798     char *s = HOP3c(strpos, prog->check_offset_min, strend);
 799
 800     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 801      "  Looking for check substr at fixed offset %"IVdf"...\n",
 802      (IV)prog->check_offset_min));
 803
 804     if (SvTAIL(check)) {
 805      /* In this case, the regex is anchored at the end too.
 806      * Unless it's a multiline match, the lengths must match
 807      * exactly, give or take a \n.  NB: slen >= 1 since
 808      * the last char of check is \n */
 809      if (!multiline
 810       && (   strend - s > slen
 811        || strend - s < slen - 1
 812        || (strend - s == slen && strend[-1] != '\n')))
 813      {
 814       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 815            "  String too long...\n"));
 816       goto fail_finish;
 817      }
 818      /* Now should match s[0..slen-2] */
 819      slen--;
 820     }
 821     if (slen && (*SvPVX_const(check) != *s
 822      || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
 823     {
 824      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 825          "  String not equal...\n"));
 826      goto fail_finish;
 827     }
 828
 829     check_at = s;
 830     goto success_at_start;
 831    }
 832   }
 833  }
 834
 835  end_shift = prog->check_end_shift;
 836
 837 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
 838  if (end_shift < 0)
 839   Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 840     (IV)end_shift, RX_PRECOMP(prog));
 841 #endif
 842
 843   restart:
 844
 845  /* This is the (re)entry point of the main loop in this function.
 846  * The goal of this loop is to:
 847  * 1) find the "check" substring in the region rx_origin..strend
 848  *    (adjusted by start_shift / end_shift). If not found, reject
 849  *    immediately.
 850  * 2) If it exists, look for the "other" substr too if defined; for
 851  *    example, if the check substr maps to the anchored substr, then
 852  *    check the floating substr, and vice-versa. If not found, go
 853  *    back to (1) with rx_origin suitably incremented.
 854  * 3) If we find an rx_origin position that doesn't contradict
 855  *    either of the substrings, then check the possible additional
 856  *    constraints on rx_origin of /^.../m or a known start class.
 857  *    If these fail, then depending on which constraints fail, jump
 858  *    back to here, or to various other re-entry points further along
 859  *    that skip some of the first steps.
 860  * 4) If we pass all those tests, update the BmUSEFUL() count on the
 861  *    substring. If the start position was determined to be at the
 862  *    beginning of the string  - so, not rejected, but not optimised,
 863  *    since we have to run regmatch from position 0 - decrement the
 864  *    BmUSEFUL() count. Otherwise increment it.
 865  */
 866
 867
 868  /* first, look for the 'check' substring */
 869
 870  {
 871   U8* start_point;
 872   U8* end_point;
 873
 874   DEBUG_OPTIMISE_MORE_r({
 875    PerlIO_printf(Perl_debug_log,
 876     "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 877     " Start shift: %"IVdf" End shift %"IVdf
 878     " Real end Shift: %"IVdf"\n",
 879     (IV)(rx_origin - strpos),
 880     (IV)prog->check_offset_min,
 881     (IV)start_shift,
 882     (IV)end_shift,
 883     (IV)prog->check_end_shift);
 884   });
 885
 886   if (prog->intflags & PREGf_CANY_SEEN) {
 887    start_point= (U8*)(rx_origin + start_shift);
 888    end_point= (U8*)(strend - end_shift);
 889    if (start_point > end_point)
 890     goto fail_finish;
 891   } else {
 892    end_point = HOP3(strend, -end_shift, strbeg);
 893    start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 894    if (!start_point)
 895     goto fail_finish;
 896   }
 897
 898
 899   /* If the regex is absolutely anchored to either the start of the
 900   * string (BOL,SBOL) or to pos() (ANCH_GPOS), then
 901   * check_offset_max represents an upper bound on the string where
 902   * the substr could start. For the ANCH_GPOS case, we assume that
 903   * the caller of intuit will have already set strpos to
 904   * pos()-gofs, so in this case strpos + offset_max will still be
 905   * an upper bound on the substr.
 906   */
 907   if (!ml_anch
 908    && prog->intflags & PREGf_ANCH
 909    && prog->check_offset_max != SSize_t_MAX)
 910   {
 911    SSize_t len = SvCUR(check) - !!SvTAIL(check);
 912    const char * const anchor =
 913       (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 914
 915    /* do a bytes rather than chars comparison. It's conservative;
 916    * so it skips doing the HOP if the result can't possibly end
 917    * up earlier than the old value of end_point.
 918    */
 919    if ((char*)end_point - anchor > prog->check_offset_max) {
 920     end_point = HOP3lim((U8*)anchor,
 921         prog->check_offset_max,
 922         end_point -len)
 923        + len;
 924    }
 925   }
 926
 927   DEBUG_OPTIMISE_MORE_r({
 928    PerlIO_printf(Perl_debug_log, "  fbm_instr len=%d str=<%.*s>\n",
 929     (int)(end_point - start_point),
 930     (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 931     start_point);
 932   });
 933
 934   check_at = fbm_instr( start_point, end_point,
 935      check, multiline ? FBMrf_MULTILINE : 0);
 936
 937   /* Update the count-of-usability, remove useless subpatterns,
 938    unshift s.  */
 939
 940   DEBUG_EXECUTE_r({
 941    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 942     SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 943    PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s%s",
 944        (check_at ? "Found" : "Did not find"),
 945     (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 946      ? "anchored" : "floating"),
 947     quoted,
 948     RE_SV_TAIL(check),
 949     (check_at ? " at offset " : "...\n") );
 950   });
 951
 952   if (!check_at)
 953    goto fail_finish;
 954   /* Finish the diagnostic message */
 955   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
 956
 957   /* set rx_origin to the minimum position where the regex could start
 958   * matching, given the constraint of the just-matched check substring.
 959   * But don't set it lower than previously.
 960   */
 961
 962   if (check_at - rx_origin > prog->check_offset_max)
 963    rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 964  }
 965
 966
 967  /* now look for the 'other' substring if defined */
 968
 969  if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 970      : prog->substrs->data[other_ix].substr)
 971  {
 972   /* Take into account the "other" substring. */
 973   char *last, *last1;
 974   char *s;
 975   SV* must;
 976   struct reg_substr_datum *other;
 977
 978  do_other_substr:
 979   other = &prog->substrs->data[other_ix];
 980
 981   /* if "other" is anchored:
 982   * we've previously found a floating substr starting at check_at.
 983   * This means that the regex origin must lie somewhere
 984   * between min (rx_origin): HOP3(check_at, -check_offset_max)
 985   * and max:                 HOP3(check_at, -check_offset_min)
 986   * (except that min will be >= strpos)
 987   * So the fixed  substr must lie somewhere between
 988   *  HOP3(min, anchored_offset)
 989   *  HOP3(max, anchored_offset) + SvCUR(substr)
 990   */
 991
 992   /* if "other" is floating
 993   * Calculate last1, the absolute latest point where the
 994   * floating substr could start in the string, ignoring any
 995   * constraints from the earlier fixed match. It is calculated
 996   * as follows:
 997   *
 998   * strend - prog->minlen (in chars) is the absolute latest
 999   * position within the string where the origin of the regex
1000   * could appear. The latest start point for the floating
1001   * substr is float_min_offset(*) on from the start of the
1002   * regex.  last1 simply combines thee two offsets.
1003   *
1004   * (*) You might think the latest start point should be
1005   * float_max_offset from the regex origin, and technically
1006   * you'd be correct. However, consider
1007   *    /a\d{2,4}bcd\w/
1008   * Here, float min, max are 3,5 and minlen is 7.
1009   * This can match either
1010   *    /a\d\dbcd\w/
1011   *    /a\d\d\dbcd\w/
1012   *    /a\d\d\d\dbcd\w/
1013   * In the first case, the regex matches minlen chars; in the
1014   * second, minlen+1, in the third, minlen+2.
1015   * In the first case, the floating offset is 3 (which equals
1016   * float_min), in the second, 4, and in the third, 5 (which
1017   * equals float_max). In all cases, the floating string bcd
1018   * can never start more than 4 chars from the end of the
1019   * string, which equals minlen - float_min. As the substring
1020   * starts to match more than float_min from the start of the
1021   * regex, it makes the regex match more than minlen chars,
1022   * and the two cancel each other out. So we can always use
1023   * float_min - minlen, rather than float_max - minlen for the
1024   * latest position in the string.
1025   *
1026   * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1027   * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1028   */
1029
1030   assert(prog->minlen >= other->min_offset);
1031   last1 = HOP3c(strend,
1032       other->min_offset - prog->minlen, strbeg);
1033
1034   if (other_ix) {/* i.e. if (other-is-float) */
1035    /* last is the latest point where the floating substr could
1036    * start, *given* any constraints from the earlier fixed
1037    * match. This constraint is that the floating string starts
1038    * <= float_max_offset chars from the regex origin (rx_origin).
1039    * If this value is less than last1, use it instead.
1040    */
1041    assert(rx_origin <= last1);
1042    last =
1043     /* this condition handles the offset==infinity case, and
1044     * is a short-cut otherwise. Although it's comparing a
1045     * byte offset to a char length, it does so in a safe way,
1046     * since 1 char always occupies 1 or more bytes,
1047     * so if a string range is  (last1 - rx_origin) bytes,
1048     * it will be less than or equal to  (last1 - rx_origin)
1049     * chars; meaning it errs towards doing the accurate HOP3
1050     * rather than just using last1 as a short-cut */
1051     (last1 - rx_origin) < other->max_offset
1052      ? last1
1053      : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1054   }
1055   else {
1056    assert(strpos + start_shift <= check_at);
1057    last = HOP4c(check_at, other->min_offset - start_shift,
1058       strbeg, strend);
1059   }
1060
1061   s = HOP3c(rx_origin, other->min_offset, strend);
1062   if (s < other_last) /* These positions already checked */
1063    s = other_last;
1064
1065   must = utf8_target ? other->utf8_substr : other->substr;
1066   assert(SvPOK(must));
1067   s = fbm_instr(
1068    (unsigned char*)s,
1069    (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1070    must,
1071    multiline ? FBMrf_MULTILINE : 0
1072   );
1073   DEBUG_EXECUTE_r({
1074    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1075     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1076    PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s",
1077     s ? "Found" : "Contradicts",
1078     other_ix ? "floating" : "anchored",
1079     quoted, RE_SV_TAIL(must));
1080   });
1081
1082
1083   if (!s) {
1084    /* last1 is latest possible substr location. If we didn't
1085    * find it before there, we never will */
1086    if (last >= last1) {
1087     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1088           ", giving up...\n"));
1089     goto fail_finish;
1090    }
1091
1092    /* try to find the check substr again at a later
1093    * position. Maybe next time we'll find the "other" substr
1094    * in range too */
1095    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1096     ", trying %s at offset %ld...\n",
1097     (other_ix ? "floating" : "anchored"),
1098     (long)(HOP3c(check_at, 1, strend) - strpos)));
1099
1100    other_last = HOP3c(last, 1, strend) /* highest failure */;
1101    rx_origin =
1102     other_ix /* i.e. if other-is-float */
1103      ? HOP3c(rx_origin, 1, strend)
1104      : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1105    goto restart;
1106   }
1107   else {
1108    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
1109     (long)(s - strpos)));
1110
1111    if (other_ix) { /* if (other-is-float) */
1112     /* other_last is set to s, not s+1, since its possible for
1113     * a floating substr to fail first time, then succeed
1114     * second time at the same floating position; e.g.:
1115     *     "-AB--AABZ" =~ /\wAB\d*Z/
1116     * The first time round, anchored and float match at
1117     * "-(AB)--AAB(Z)" then fail on the initial \w character
1118     * class. Second time round, they match at "-AB--A(AB)(Z)".
1119     */
1120     other_last = s;
1121    }
1122    else {
1123     rx_origin = HOP3c(s, -other->min_offset, strbeg);
1124     other_last = HOP3c(s, 1, strend);
1125    }
1126   }
1127  }
1128  else {
1129   DEBUG_OPTIMISE_MORE_r(
1130    PerlIO_printf(Perl_debug_log,
1131     "  Check-only match: offset min:%"IVdf" max:%"IVdf
1132     " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1133     " strend-strpos:%"IVdf"\n",
1134     (IV)prog->check_offset_min,
1135     (IV)prog->check_offset_max,
1136     (IV)(check_at-strpos),
1137     (IV)(rx_origin-strpos),
1138     (IV)(rx_origin-check_at),
1139     (IV)(strend-strpos)
1140    )
1141   );
1142  }
1143
1144   postprocess_substr_matches:
1145
1146  /* handle the extra constraint of /^.../m if present */
1147
1148  if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1149   char *s;
1150
1151   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1152       "  looking for /^/m anchor"));
1153
1154   /* we have failed the constraint of a \n before rx_origin.
1155   * Find the next \n, if any, even if it's beyond the current
1156   * anchored and/or floating substrings. Whether we should be
1157   * scanning ahead for the next \n or the next substr is debatable.
1158   * On the one hand you'd expect rare substrings to appear less
1159   * often than \n's. On the other hand, searching for \n means
1160   * we're effectively flipping been check_substr and "\n" on each
1161   * iteration as the current "rarest" string candidate, which
1162   * means for example that we'll quickly reject the whole string if
1163   * hasn't got a \n, rather than trying every substr position
1164   * first
1165   */
1166
1167   s = HOP3c(strend, - prog->minlen, strpos);
1168   if (s <= rx_origin ||
1169    ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1170   {
1171    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1172        "  Did not find /%s^%s/m...\n",
1173        PL_colors[0], PL_colors[1]));
1174    goto fail_finish;
1175   }
1176
1177   /* earliest possible origin is 1 char after the \n.
1178   * (since *rx_origin == '\n', it's safe to ++ here rather than
1179   * HOP(rx_origin, 1)) */
1180   rx_origin++;
1181
1182   if (prog->substrs->check_ix == 0  /* check is anchored */
1183    || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1184   {
1185    /* Position contradicts check-string; either because
1186    * check was anchored (and thus has no wiggle room),
1187    * or check was float and rx_origin is above the float range */
1188    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1189     "  Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1190     PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1191    goto restart;
1192   }
1193
1194   /* if we get here, the check substr must have been float,
1195   * is in range, and we may or may not have had an anchored
1196   * "other" substr which still contradicts */
1197   assert(prog->substrs->check_ix); /* check is float */
1198
1199   if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1200    /* whoops, the anchored "other" substr exists, so we still
1201    * contradict. On the other hand, the float "check" substr
1202    * didn't contradict, so just retry the anchored "other"
1203    * substr */
1204    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1205     "  Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1206     PL_colors[0], PL_colors[1],
1207     (long)(rx_origin - strpos),
1208     (long)(rx_origin - strpos + prog->anchored_offset)));
1209    goto do_other_substr;
1210   }
1211
1212   /* success: we don't contradict the found floating substring
1213   * (and there's no anchored substr). */
1214   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1215    "  Found /%s^%s/m at offset %ld...\n",
1216    PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1217  }
1218  else {
1219   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1220    "  (multiline anchor test skipped)\n"));
1221  }
1222
1223   success_at_start:
1224
1225
1226  /* if we have a starting character class, then test that extra constraint.
1227  * (trie stclasses are too expensive to use here, we are better off to
1228  * leave it to regmatch itself) */
1229
1230  if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1231   const U8* const str = (U8*)STRING(progi->regstclass);
1232
1233   /* XXX this value could be pre-computed */
1234   const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1235      ?  (reginfo->is_utf8_pat
1236       ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1237       : STR_LEN(progi->regstclass))
1238      : 1);
1239   char * endpos;
1240   char *s;
1241   /* latest pos that a matching float substr constrains rx start to */
1242   char *rx_max_float = NULL;
1243
1244   /* if the current rx_origin is anchored, either by satisfying an
1245   * anchored substring constraint, or a /^.../m constraint, then we
1246   * can reject the current origin if the start class isn't found
1247   * at the current position. If we have a float-only match, then
1248   * rx_origin is constrained to a range; so look for the start class
1249   * in that range. if neither, then look for the start class in the
1250   * whole rest of the string */
1251
1252   /* XXX DAPM it's not clear what the minlen test is for, and why
1253   * it's not used in the floating case. Nothing in the test suite
1254   * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1255   * Here are some old comments, which may or may not be correct:
1256   *
1257   *   minlen == 0 is possible if regstclass is \b or \B,
1258   *   and the fixed substr is ''$.
1259   *   Since minlen is already taken into account, rx_origin+1 is
1260   *   before strend; accidentally, minlen >= 1 guaranties no false
1261   *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1262   *   0) below assumes that regstclass does not come from lookahead...
1263   *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1264   *   This leaves EXACTF-ish only, which are dealt with in
1265   *   find_byclass().
1266   */
1267
1268   if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1269    endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1270   else if (prog->float_substr || prog->float_utf8) {
1271    rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1272    endpos= HOP3c(rx_max_float, cl_l, strend);
1273   }
1274   else
1275    endpos= strend;
1276
1277   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1278    "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1279    " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1280    (IV)start_shift, (IV)(check_at - strbeg),
1281    (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1282
1283   s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1284        reginfo);
1285   if (!s) {
1286    if (endpos == strend) {
1287     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1288         "  Could not match STCLASS...\n") );
1289     goto fail;
1290    }
1291    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1292        "  This position contradicts STCLASS...\n") );
1293    if ((prog->intflags & PREGf_ANCH) && !ml_anch
1294       && !(prog->intflags & PREGf_IMPLICIT))
1295     goto fail;
1296
1297    /* Contradict one of substrings */
1298    if (prog->anchored_substr || prog->anchored_utf8) {
1299     if (prog->substrs->check_ix == 1) { /* check is float */
1300      /* Have both, check_string is floating */
1301      assert(rx_origin + start_shift <= check_at);
1302      if (rx_origin + start_shift != check_at) {
1303       /* not at latest position float substr could match:
1304       * Recheck anchored substring, but not floating.
1305       * The condition above is in bytes rather than
1306       * chars for efficiency. It's conservative, in
1307       * that it errs on the side of doing 'goto
1308       * do_other_substr', where a more accurate
1309       * char-based calculation will be done */
1310       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1311         "  Looking for anchored substr starting at offset %ld...\n",
1312         (long)(other_last - strpos)) );
1313       goto do_other_substr;
1314      }
1315     }
1316    }
1317    else {
1318     /* float-only */
1319
1320     if (ml_anch) {
1321      /* In the presence of ml_anch, we might be able to
1322      * find another \n without breaking the current float
1323      * constraint. */
1324
1325      /* strictly speaking this should be HOP3c(..., 1, ...),
1326      * but since we goto a block of code that's going to
1327      * search for the next \n if any, its safe here */
1328      rx_origin++;
1329      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1330        "  Looking for /%s^%s/m starting at offset %ld...\n",
1331        PL_colors[0], PL_colors[1],
1332        (long)(rx_origin - strpos)) );
1333      goto postprocess_substr_matches;
1334     }
1335
1336     /* strictly speaking this can never be true; but might
1337     * be if we ever allow intuit without substrings */
1338     if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1339      goto fail;
1340
1341     rx_origin = rx_max_float;
1342    }
1343
1344    /* at this point, any matching substrings have been
1345    * contradicted. Start again... */
1346
1347    rx_origin = HOP3c(rx_origin, 1, strend);
1348
1349    /* uses bytes rather than char calculations for efficiency.
1350    * It's conservative: it errs on the side of doing 'goto restart',
1351    * where there is code that does a proper char-based test */
1352    if (rx_origin + start_shift + end_shift > strend) {
1353     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1354          "  Could not match STCLASS...\n") );
1355     goto fail;
1356    }
1357    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1358     "  Looking for %s substr starting at offset %ld...\n",
1359     (prog->substrs->check_ix ? "floating" : "anchored"),
1360     (long)(rx_origin + start_shift - strpos)) );
1361    goto restart;
1362   }
1363
1364   /* Success !!! */
1365
1366   if (rx_origin != s) {
1367    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1368       "  By STCLASS: moving %ld --> %ld\n",
1369         (long)(rx_origin - strpos), (long)(s - strpos))
1370     );
1371   }
1372   else {
1373    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1374         "  Does not contradict STCLASS...\n");
1375     );
1376   }
1377  }
1378
1379  /* Decide whether using the substrings helped */
1380
1381  if (rx_origin != strpos) {
1382   /* Fixed substring is found far enough so that the match
1383   cannot start at strpos. */
1384
1385   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  try at offset...\n"));
1386   ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1387  }
1388  else {
1389   /* The found rx_origin position does not prohibit matching at
1390   * strpos, so calling intuit didn't gain us anything. Decrement
1391   * the BmUSEFUL() count on the check substring, and if we reach
1392   * zero, free it.  */
1393   if (!(prog->intflags & PREGf_NAUGHTY)
1394    && (utf8_target ? (
1395     prog->check_utf8  /* Could be deleted already */
1396     && --BmUSEFUL(prog->check_utf8) < 0
1397     && (prog->check_utf8 == prog->float_utf8)
1398    ) : (
1399     prog->check_substr  /* Could be deleted already */
1400     && --BmUSEFUL(prog->check_substr) < 0
1401     && (prog->check_substr == prog->float_substr)
1402    )))
1403   {
1404    /* If flags & SOMETHING - do not do it many times on the same match */
1405    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  ... Disabling check substring...\n"));
1406    /* XXX Does the destruction order has to change with utf8_target? */
1407    SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1408    SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1409    prog->check_substr = prog->check_utf8 = NULL; /* disable */
1410    prog->float_substr = prog->float_utf8 = NULL; /* clear */
1411    check = NULL;   /* abort */
1412    /* XXXX This is a remnant of the old implementation.  It
1413      looks wasteful, since now INTUIT can use many
1414      other heuristics. */
1415    prog->extflags &= ~RXf_USE_INTUIT;
1416   }
1417  }
1418
1419  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1420    "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1421    PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1422
1423  return rx_origin;
1424
1425   fail_finish:    /* Substring not found */
1426  if (prog->check_substr || prog->check_utf8)  /* could be removed already */
1427   BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1428   fail:
1429  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1430       PL_colors[4], PL_colors[5]));
1431  return NULL;
1432 }
1433
1434
1435 #define DECL_TRIE_TYPE(scan) \
1436  const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1437     trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
1438      trie_type = ((scan->flags == EXACT) \
1439        ? (utf8_target ? trie_utf8 : trie_plain) \
1440        : (scan->flags == EXACTFA) \
1441         ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1442         : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1443
1444 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1445 STMT_START {                                                                        \
1446  STRLEN skiplen;                                                                 \
1447  U8 flags = FOLD_FLAGS_FULL;                                                     \
1448  switch (trie_type) {                                                            \
1449  case trie_utf8_exactfa_fold:                                                    \
1450   flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1451   /* FALLTHROUGH */                                                          \
1452  case trie_utf8_fold:                                                            \
1453   if ( foldlen>0 ) {                                                          \
1454    uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1455    foldlen -= len;                                                         \
1456    uscan += len;                                                           \
1457    len=0;                                                                  \
1458   } else {                                                                    \
1459    uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
1460    len = UTF8SKIP(uc);                                                     \
1461    skiplen = UNISKIP( uvc );                                               \
1462    foldlen -= skiplen;                                                     \
1463    uscan = foldbuf + skiplen;                                              \
1464   }                                                                           \
1465   break;                                                                      \
1466  case trie_latin_utf8_exactfa_fold:                                              \
1467   flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1468   /* FALLTHROUGH */                                                          \
1469  case trie_latin_utf8_fold:                                                      \
1470   if ( foldlen>0 ) {                                                          \
1471    uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1472    foldlen -= len;                                                         \
1473    uscan += len;                                                           \
1474    len=0;                                                                  \
1475   } else {                                                                    \
1476    len = 1;                                                                \
1477    uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1478    skiplen = UNISKIP( uvc );                                               \
1479    foldlen -= skiplen;                                                     \
1480    uscan = foldbuf + skiplen;                                              \
1481   }                                                                           \
1482   break;                                                                      \
1483  case trie_utf8:                                                                 \
1484   uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1485   break;                                                                      \
1486  case trie_plain:                                                                \
1487   uvc = (UV)*uc;                                                              \
1488   len = 1;                                                                    \
1489  }                                                                               \
1490  if (uvc < 256) {                                                                \
1491   charid = trie->charmap[ uvc ];                                              \
1492  }                                                                               \
1493  else {                                                                          \
1494   charid = 0;                                                                 \
1495   if (widecharmap) {                                                          \
1496    SV** const svpp = hv_fetch(widecharmap,                                 \
1497       (char*)&uvc, sizeof(UV), 0);                                \
1498    if (svpp)                                                               \
1499     charid = (U16)SvIV(*svpp);                                          \
1500   }                                                                           \
1501  }                                                                               \
1502 } STMT_END
1503
1504 #define DUMP_EXEC_POS(li,s,doutf8)                          \
1505  dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1506     startpos, doutf8)
1507
1508 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1509 STMT_START {                                              \
1510  while (s <= e) {                                      \
1511   if ( (COND)                                       \
1512    && (ln == 1 || folder(s, pat_string, ln))    \
1513    && (reginfo->intuit || regtry(reginfo, &s)) )\
1514    goto got_it;                                  \
1515   s++;                                              \
1516  }                                                     \
1517 } STMT_END
1518
1519 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1520 STMT_START {                                          \
1521  while (s < strend) {                              \
1522   CODE                                          \
1523   s += UTF8SKIP(s);                             \
1524  }                                                 \
1525 } STMT_END
1526
1527 #define REXEC_FBC_SCAN(CODE)                          \
1528 STMT_START {                                          \
1529  while (s < strend) {                              \
1530   CODE                                          \
1531   s++;                                          \
1532  }                                                 \
1533 } STMT_END
1534
1535 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1536 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1537  if (COND) {                                                \
1538   if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1539    goto got_it;                                       \
1540   else                                                   \
1541    tmp = doevery;                                     \
1542  }                                                          \
1543  else                                                       \
1544   tmp = 1;                                               \
1545 )
1546
1547 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1548 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1549  if (COND) {                                                \
1550   if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1551    goto got_it;                                       \
1552   else                                                   \
1553    tmp = doevery;                                     \
1554  }                                                          \
1555  else                                                       \
1556   tmp = 1;                                               \
1557 )
1558
1559 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1560  if (utf8_target) {                                         \
1561   REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1562  }                                                          \
1563  else {                                                     \
1564   REXEC_FBC_CLASS_SCAN(COND);                            \
1565  }
1566
1567 /* The three macros below are slightly different versions of the same logic.
1568  *
1569  * The first is for /a and /aa when the target string is UTF-8.  This can only
1570  * match ascii, but it must advance based on UTF-8.   The other two handle the
1571  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1572  * for the boundary (or non-boundary) between a word and non-word character.
1573  * The utf8 and non-utf8 cases have the same logic, but the details must be
1574  * different.  Find the "wordness" of the character just prior to this one, and
1575  * compare it with the wordness of this one.  If they differ, we have a
1576  * boundary.  At the beginning of the string, pretend that the previous
1577  * character was a new-line.
1578  *
1579  * All these macros uncleanly have side-effects with each other and outside
1580  * variables.  So far it's been too much trouble to clean-up
1581  *
1582  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1583  *               a word character or not.
1584  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1585  *               word/non-word
1586  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1587  *
1588  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1589  * are looking for a boundary or for a non-boundary.  If we are looking for a
1590  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1591  * see if this tentative match actually works, and if so, to quit the loop
1592  * here.  And vice-versa if we are looking for a non-boundary.
1593  *
1594  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1595  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1596  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1597  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1598  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1599  * complement.  But in that branch we complement tmp, meaning that at the
1600  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1601  * which means at the top of the loop in the next iteration, it is
1602  * TEST_NON_UTF8(s-1) */
1603 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1604  tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1605  tmp = TEST_NON_UTF8(tmp);                                                  \
1606  REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1607   if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1608    tmp = !tmp;                                                        \
1609    IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1610   }                                                                      \
1611   else {                                                                 \
1612    IF_FAIL;                                                           \
1613   }                                                                      \
1614  );                                                                         \
1615
1616 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1617  * TEST_UTF8 is a macro that for the same input code points returns identically
1618  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1619 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1620  if (s == reginfo->strbeg) {                                                \
1621   tmp = '\n';                                                            \
1622  }                                                                          \
1623  else { /* Back-up to the start of the previous character */                \
1624   U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1625   tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1626              0, UTF8_ALLOW_DEFAULT); \
1627  }                                                                          \
1628  tmp = TEST_UV(tmp);                                                        \
1629  LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1630  REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1631   if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
1632    tmp = !tmp;                                                        \
1633    IF_SUCCESS;                                                        \
1634   }                                                                      \
1635   else {                                                                 \
1636    IF_FAIL;                                                           \
1637   }                                                                      \
1638  );
1639
1640 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1641  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1642  * macros below */
1643 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1644  if (utf8_target) {                                                         \
1645   UTF8_CODE                                                              \
1646  }                                                                          \
1647  else {  /* Not utf8 */                                                     \
1648   tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1649   tmp = TEST_NON_UTF8(tmp);                                              \
1650   REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1651    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1652     IF_SUCCESS;                                                    \
1653     tmp = !tmp;                                                    \
1654    }                                                                  \
1655    else {                                                             \
1656     IF_FAIL;                                                       \
1657    }                                                                  \
1658   );                                                                     \
1659  }                                                                          \
1660  /* Here, things have been set up by the previous code so that tmp is the   \
1661  * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1662  * utf8ness of the target).  We also have to check if this matches against \
1663  * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1664  * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1665  * string */                                                               \
1666  if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1667   IF_SUCCESS;                                                            \
1668  }                                                                          \
1669  else {                                                                     \
1670   IF_FAIL;                                                               \
1671  }
1672
1673 /* This is the macro to use when we want to see if something that looks like it
1674  * could match, actually does, and if so exits the loop */
1675 #define REXEC_FBC_TRYIT                            \
1676  if ((reginfo->intuit || regtry(reginfo, &s)))  \
1677   goto got_it
1678
1679 /* The only difference between the BOUND and NBOUND cases is that
1680  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1681  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1682  * with the other one being empty (PLACEHOLDER is defined as empty).
1683  *
1684  * The TEST_FOO parameters are for operating on different forms of input, but
1685  * all should be ones that return identically for the same underlying code
1686  * points */
1687 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1688  FBC_BOUND_COMMON(                                                          \
1689   FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1690   TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1691
1692 #define FBC_BOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                         \
1693  FBC_BOUND_COMMON(                                                          \
1694    FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1695    TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1696
1697 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1698  FBC_BOUND_COMMON(                                                          \
1699   FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1700   TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1701
1702 #define FBC_NBOUND_A(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                        \
1703  FBC_BOUND_COMMON(                                                          \
1704    FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1705    TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1706
1707
1708 /* We know what class REx starts with.  Try to find this position... */
1709 /* if reginfo->intuit, its a dryrun */
1710 /* annoyingly all the vars in this routine have different names from their counterparts
1711    in regmatch. /grrr */
1712 STATIC char *
1713 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1714  const char *strend, regmatch_info *reginfo)
1715 {
1716  dVAR;
1717  const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1718  char *pat_string;   /* The pattern's exactish string */
1719  char *pat_end;     /* ptr to end char of pat_string */
1720  re_fold_t folder; /* Function for computing non-utf8 folds */
1721  const U8 *fold_array;   /* array for folding ords < 256 */
1722  STRLEN ln;
1723  STRLEN lnc;
1724  U8 c1;
1725  U8 c2;
1726  char *e;
1727  I32 tmp = 1; /* Scratch variable? */
1728  const bool utf8_target = reginfo->is_utf8_target;
1729  UV utf8_fold_flags = 0;
1730  const bool is_utf8_pat = reginfo->is_utf8_pat;
1731  bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1732         with a result inverts that result, as 0^1 =
1733         1 and 1^1 = 0 */
1734  _char_class_number classnum;
1735
1736  RXi_GET_DECL(prog,progi);
1737
1738  PERL_ARGS_ASSERT_FIND_BYCLASS;
1739
1740  /* We know what class it must start with. */
1741  switch (OP(c)) {
1742  case ANYOF:
1743   if (utf8_target) {
1744    REXEC_FBC_UTF8_CLASS_SCAN(
1745      reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1746   }
1747   else {
1748    REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1749   }
1750   break;
1751  case CANY:
1752   REXEC_FBC_SCAN(
1753    if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1754     goto got_it;
1755    else
1756     tmp = doevery;
1757   );
1758   break;
1759
1760  case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1761   assert(! is_utf8_pat);
1762   /* FALLTHROUGH */
1763  case EXACTFA:
1764   if (is_utf8_pat || utf8_target) {
1765    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1766    goto do_exactf_utf8;
1767   }
1768   fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1769   folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1770   goto do_exactf_non_utf8; /* isn't dealt with by these */
1771
1772  case EXACTF:   /* This node only generated for non-utf8 patterns */
1773   assert(! is_utf8_pat);
1774   if (utf8_target) {
1775    utf8_fold_flags = 0;
1776    goto do_exactf_utf8;
1777   }
1778   fold_array = PL_fold;
1779   folder = foldEQ;
1780   goto do_exactf_non_utf8;
1781
1782  case EXACTFL:
1783   if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1784    utf8_fold_flags = FOLDEQ_LOCALE;
1785    goto do_exactf_utf8;
1786   }
1787   fold_array = PL_fold_locale;
1788   folder = foldEQ_locale;
1789   goto do_exactf_non_utf8;
1790
1791  case EXACTFU_SS:
1792   if (is_utf8_pat) {
1793    utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1794   }
1795   goto do_exactf_utf8;
1796
1797  case EXACTFU:
1798   if (is_utf8_pat || utf8_target) {
1799    utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1800    goto do_exactf_utf8;
1801   }
1802
1803   /* Any 'ss' in the pattern should have been replaced by regcomp,
1804   * so we don't have to worry here about this single special case
1805   * in the Latin1 range */
1806   fold_array = PL_fold_latin1;
1807   folder = foldEQ_latin1;
1808
1809   /* FALLTHROUGH */
1810
1811  do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1812       are no glitches with fold-length differences
1813       between the target string and pattern */
1814
1815   /* The idea in the non-utf8 EXACTF* cases is to first find the
1816   * first character of the EXACTF* node and then, if necessary,
1817   * case-insensitively compare the full text of the node.  c1 is the
1818   * first character.  c2 is its fold.  This logic will not work for
1819   * Unicode semantics and the german sharp ss, which hence should
1820   * not be compiled into a node that gets here. */
1821   pat_string = STRING(c);
1822   ln  = STR_LEN(c); /* length to match in octets/bytes */
1823
1824   /* We know that we have to match at least 'ln' bytes (which is the
1825   * same as characters, since not utf8).  If we have to match 3
1826   * characters, and there are only 2 availabe, we know without
1827   * trying that it will fail; so don't start a match past the
1828   * required minimum number from the far end */
1829   e = HOP3c(strend, -((SSize_t)ln), s);
1830
1831   if (reginfo->intuit && e < s) {
1832    e = s;   /* Due to minlen logic of intuit() */
1833   }
1834
1835   c1 = *pat_string;
1836   c2 = fold_array[c1];
1837   if (c1 == c2) { /* If char and fold are the same */
1838    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1839   }
1840   else {
1841    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1842   }
1843   break;
1844
1845  do_exactf_utf8:
1846  {
1847   unsigned expansion;
1848
1849   /* If one of the operands is in utf8, we can't use the simpler folding
1850   * above, due to the fact that many different characters can have the
1851   * same fold, or portion of a fold, or different- length fold */
1852   pat_string = STRING(c);
1853   ln  = STR_LEN(c); /* length to match in octets/bytes */
1854   pat_end = pat_string + ln;
1855   lnc = is_utf8_pat       /* length to match in characters */
1856     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1857     : ln;
1858
1859   /* We have 'lnc' characters to match in the pattern, but because of
1860   * multi-character folding, each character in the target can match
1861   * up to 3 characters (Unicode guarantees it will never exceed
1862   * this) if it is utf8-encoded; and up to 2 if not (based on the
1863   * fact that the Latin 1 folds are already determined, and the
1864   * only multi-char fold in that range is the sharp-s folding to
1865   * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1866   * string character.  Adjust lnc accordingly, rounding up, so that
1867   * if we need to match at least 4+1/3 chars, that really is 5. */
1868   expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1869   lnc = (lnc + expansion - 1) / expansion;
1870
1871   /* As in the non-UTF8 case, if we have to match 3 characters, and
1872   * only 2 are left, it's guaranteed to fail, so don't start a
1873   * match that would require us to go beyond the end of the string
1874   */
1875   e = HOP3c(strend, -((SSize_t)lnc), s);
1876
1877   if (reginfo->intuit && e < s) {
1878    e = s;   /* Due to minlen logic of intuit() */
1879   }
1880
1881   /* XXX Note that we could recalculate e to stop the loop earlier,
1882   * as the worst case expansion above will rarely be met, and as we
1883   * go along we would usually find that e moves further to the left.
1884   * This would happen only after we reached the point in the loop
1885   * where if there were no expansion we should fail.  Unclear if
1886   * worth the expense */
1887
1888   while (s <= e) {
1889    char *my_strend= (char *)strend;
1890    if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1891     pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1892     && (reginfo->intuit || regtry(reginfo, &s)) )
1893    {
1894     goto got_it;
1895    }
1896    s += (utf8_target) ? UTF8SKIP(s) : 1;
1897   }
1898   break;
1899  }
1900
1901  case BOUNDL:
1902   FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1903   break;
1904  case NBOUNDL:
1905   FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
1906   break;
1907  case BOUND:
1908   FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
1909   break;
1910  case BOUNDA:
1911   FBC_BOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
1912   break;
1913  case NBOUND:
1914   FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
1915   break;
1916  case NBOUNDA:
1917   FBC_NBOUND_A(isWORDCHAR_A, isWORDCHAR_A, isWORDCHAR_A);
1918   break;
1919  case BOUNDU:
1920   FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
1921   break;
1922  case NBOUNDU:
1923   FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
1924   break;
1925  case LNBREAK:
1926   REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1927       is_LNBREAK_latin1_safe(s, strend)
1928   );
1929   break;
1930
1931  /* The argument to all the POSIX node types is the class number to pass to
1932  * _generic_isCC() to build a mask for searching in PL_charclass[] */
1933
1934  case NPOSIXL:
1935   to_complement = 1;
1936   /* FALLTHROUGH */
1937
1938  case POSIXL:
1939   REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1940       to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1941   break;
1942
1943  case NPOSIXD:
1944   to_complement = 1;
1945   /* FALLTHROUGH */
1946
1947  case POSIXD:
1948   if (utf8_target) {
1949    goto posix_utf8;
1950   }
1951   goto posixa;
1952
1953  case NPOSIXA:
1954   if (utf8_target) {
1955    /* The complement of something that matches only ASCII matches all
1956    * non-ASCII, plus everything in ASCII that isn't in the class. */
1957    REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
1958          || ! _generic_isCC_A(*s, FLAGS(c)));
1959    break;
1960   }
1961
1962   to_complement = 1;
1963   /* FALLTHROUGH */
1964
1965  case POSIXA:
1966  posixa:
1967   /* Don't need to worry about utf8, as it can match only a single
1968   * byte invariant character. */
1969   REXEC_FBC_CLASS_SCAN(
1970       to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1971   break;
1972
1973  case NPOSIXU:
1974   to_complement = 1;
1975   /* FALLTHROUGH */
1976
1977  case POSIXU:
1978   if (! utf8_target) {
1979    REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1980                  FLAGS(c))));
1981   }
1982   else {
1983
1984  posix_utf8:
1985    classnum = (_char_class_number) FLAGS(c);
1986    if (classnum < _FIRST_NON_SWASH_CC) {
1987     while (s < strend) {
1988
1989      /* We avoid loading in the swash as long as possible, but
1990      * should we have to, we jump to a separate loop.  This
1991      * extra 'if' statement is what keeps this code from being
1992      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1993      if (UTF8_IS_ABOVE_LATIN1(*s)) {
1994       goto found_above_latin1;
1995      }
1996      if ((UTF8_IS_INVARIANT(*s)
1997       && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1998                 classnum)))
1999       || (UTF8_IS_DOWNGRADEABLE_START(*s)
2000        && to_complement ^ cBOOL(
2001         _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
2002                  *(s + 1)),
2003            classnum))))
2004      {
2005       if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2006        goto got_it;
2007       else {
2008        tmp = doevery;
2009       }
2010      }
2011      else {
2012       tmp = 1;
2013      }
2014      s += UTF8SKIP(s);
2015     }
2016    }
2017    else switch (classnum) {    /* These classes are implemented as
2018           macros */
2019     case _CC_ENUM_SPACE: /* XXX would require separate code if we
2020           revert the change of \v matching this */
2021      /* FALLTHROUGH */
2022
2023     case _CC_ENUM_PSXSPC:
2024      REXEC_FBC_UTF8_CLASS_SCAN(
2025           to_complement ^ cBOOL(isSPACE_utf8(s)));
2026      break;
2027
2028     case _CC_ENUM_BLANK:
2029      REXEC_FBC_UTF8_CLASS_SCAN(
2030           to_complement ^ cBOOL(isBLANK_utf8(s)));
2031      break;
2032
2033     case _CC_ENUM_XDIGIT:
2034      REXEC_FBC_UTF8_CLASS_SCAN(
2035          to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2036      break;
2037
2038     case _CC_ENUM_VERTSPACE:
2039      REXEC_FBC_UTF8_CLASS_SCAN(
2040          to_complement ^ cBOOL(isVERTWS_utf8(s)));
2041      break;
2042
2043     case _CC_ENUM_CNTRL:
2044      REXEC_FBC_UTF8_CLASS_SCAN(
2045           to_complement ^ cBOOL(isCNTRL_utf8(s)));
2046      break;
2047
2048     default:
2049      Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2050      assert(0); /* NOTREACHED */
2051    }
2052   }
2053   break;
2054
2055  found_above_latin1:   /* Here we have to load a swash to get the result
2056        for the current code point */
2057   if (! PL_utf8_swash_ptrs[classnum]) {
2058    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2059    PL_utf8_swash_ptrs[classnum] =
2060      _core_swash_init("utf8",
2061          "",
2062          &PL_sv_undef, 1, 0,
2063          PL_XPosix_ptrs[classnum], &flags);
2064   }
2065
2066   /* This is a copy of the loop above for swash classes, though using the
2067   * FBC macro instead of being expanded out.  Since we've loaded the
2068   * swash, we don't have to check for that each time through the loop */
2069   REXEC_FBC_UTF8_CLASS_SCAN(
2070     to_complement ^ cBOOL(_generic_utf8(
2071          classnum,
2072          s,
2073          swash_fetch(PL_utf8_swash_ptrs[classnum],
2074             (U8 *) s, TRUE))));
2075   break;
2076
2077  case AHOCORASICKC:
2078  case AHOCORASICK:
2079   {
2080    DECL_TRIE_TYPE(c);
2081    /* what trie are we using right now */
2082    reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2083    reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2084    HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2085
2086    const char *last_start = strend - trie->minlen;
2087 #ifdef DEBUGGING
2088    const char *real_start = s;
2089 #endif
2090    STRLEN maxlen = trie->maxlen;
2091    SV *sv_points;
2092    U8 **points; /* map of where we were in the input string
2093        when reading a given char. For ASCII this
2094        is unnecessary overhead as the relationship
2095        is always 1:1, but for Unicode, especially
2096        case folded Unicode this is not true. */
2097    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2098    U8 *bitmap=NULL;
2099
2100
2101    GET_RE_DEBUG_FLAGS_DECL;
2102
2103    /* We can't just allocate points here. We need to wrap it in
2104    * an SV so it gets freed properly if there is a croak while
2105    * running the match */
2106    ENTER;
2107    SAVETMPS;
2108    sv_points=newSV(maxlen * sizeof(U8 *));
2109    SvCUR_set(sv_points,
2110     maxlen * sizeof(U8 *));
2111    SvPOK_on(sv_points);
2112    sv_2mortal(sv_points);
2113    points=(U8**)SvPV_nolen(sv_points );
2114    if ( trie_type != trie_utf8_fold
2115     && (trie->bitmap || OP(c)==AHOCORASICKC) )
2116    {
2117     if (trie->bitmap)
2118      bitmap=(U8*)trie->bitmap;
2119     else
2120      bitmap=(U8*)ANYOF_BITMAP(c);
2121    }
2122    /* this is the Aho-Corasick algorithm modified a touch
2123    to include special handling for long "unknown char" sequences.
2124    The basic idea being that we use AC as long as we are dealing
2125    with a possible matching char, when we encounter an unknown char
2126    (and we have not encountered an accepting state) we scan forward
2127    until we find a legal starting char.
2128    AC matching is basically that of trie matching, except that when
2129    we encounter a failing transition, we fall back to the current
2130    states "fail state", and try the current char again, a process
2131    we repeat until we reach the root state, state 1, or a legal
2132    transition. If we fail on the root state then we can either
2133    terminate if we have reached an accepting state previously, or
2134    restart the entire process from the beginning if we have not.
2135
2136    */
2137    while (s <= last_start) {
2138     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2139     U8 *uc = (U8*)s;
2140     U16 charid = 0;
2141     U32 base = 1;
2142     U32 state = 1;
2143     UV uvc = 0;
2144     STRLEN len = 0;
2145     STRLEN foldlen = 0;
2146     U8 *uscan = (U8*)NULL;
2147     U8 *leftmost = NULL;
2148 #ifdef DEBUGGING
2149     U32 accepted_word= 0;
2150 #endif
2151     U32 pointpos = 0;
2152
2153     while ( state && uc <= (U8*)strend ) {
2154      int failed=0;
2155      U32 word = aho->states[ state ].wordnum;
2156
2157      if( state==1 ) {
2158       if ( bitmap ) {
2159        DEBUG_TRIE_EXECUTE_r(
2160         if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2161          dump_exec_pos( (char *)uc, c, strend, real_start,
2162           (char *)uc, utf8_target );
2163          PerlIO_printf( Perl_debug_log,
2164           " Scanning for legal start char...\n");
2165         }
2166        );
2167        if (utf8_target) {
2168         while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2169          uc += UTF8SKIP(uc);
2170         }
2171        } else {
2172         while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2173          uc++;
2174         }
2175        }
2176        s= (char *)uc;
2177       }
2178       if (uc >(U8*)last_start) break;
2179      }
2180
2181      if ( word ) {
2182       U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2183       if (!leftmost || lpos < leftmost) {
2184        DEBUG_r(accepted_word=word);
2185        leftmost= lpos;
2186       }
2187       if (base==0) break;
2188
2189      }
2190      points[pointpos++ % maxlen]= uc;
2191      if (foldlen || uc < (U8*)strend) {
2192       REXEC_TRIE_READ_CHAR(trie_type, trie,
2193           widecharmap, uc,
2194           uscan, len, uvc, charid, foldlen,
2195           foldbuf, uniflags);
2196       DEBUG_TRIE_EXECUTE_r({
2197        dump_exec_pos( (char *)uc, c, strend,
2198           real_start, s, utf8_target);
2199        PerlIO_printf(Perl_debug_log,
2200         " Charid:%3u CP:%4"UVxf" ",
2201         charid, uvc);
2202       });
2203      }
2204      else {
2205       len = 0;
2206       charid = 0;
2207      }
2208
2209
2210      do {
2211 #ifdef DEBUGGING
2212       word = aho->states[ state ].wordnum;
2213 #endif
2214       base = aho->states[ state ].trans.base;
2215
2216       DEBUG_TRIE_EXECUTE_r({
2217        if (failed)
2218         dump_exec_pos( (char *)uc, c, strend, real_start,
2219          s,   utf8_target );
2220        PerlIO_printf( Perl_debug_log,
2221         "%sState: %4"UVxf", word=%"UVxf,
2222         failed ? " Fail transition to " : "",
2223         (UV)state, (UV)word);
2224       });
2225       if ( base ) {
2226        U32 tmp;
2227        I32 offset;
2228        if (charid &&
2229         ( ((offset = base + charid
2230          - 1 - trie->uniquecharcount)) >= 0)
2231         && ((U32)offset < trie->lasttrans)
2232         && trie->trans[offset].check == state
2233         && (tmp=trie->trans[offset].next))
2234        {
2235         DEBUG_TRIE_EXECUTE_r(
2236          PerlIO_printf( Perl_debug_log," - legal\n"));
2237         state = tmp;
2238         break;
2239        }
2240        else {
2241         DEBUG_TRIE_EXECUTE_r(
2242          PerlIO_printf( Perl_debug_log," - fail\n"));
2243         failed = 1;
2244         state = aho->fail[state];
2245        }
2246       }
2247       else {
2248        /* we must be accepting here */
2249        DEBUG_TRIE_EXECUTE_r(
2250          PerlIO_printf( Perl_debug_log," - accepting\n"));
2251        failed = 1;
2252        break;
2253       }
2254      } while(state);
2255      uc += len;
2256      if (failed) {
2257       if (leftmost)
2258        break;
2259       if (!state) state = 1;
2260      }
2261     }
2262     if ( aho->states[ state ].wordnum ) {
2263      U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2264      if (!leftmost || lpos < leftmost) {
2265       DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2266       leftmost = lpos;
2267      }
2268     }
2269     if (leftmost) {
2270      s = (char*)leftmost;
2271      DEBUG_TRIE_EXECUTE_r({
2272       PerlIO_printf(
2273        Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2274        (UV)accepted_word, (IV)(s - real_start)
2275       );
2276      });
2277      if (reginfo->intuit || regtry(reginfo, &s)) {
2278       FREETMPS;
2279       LEAVE;
2280       goto got_it;
2281      }
2282      s = HOPc(s,1);
2283      DEBUG_TRIE_EXECUTE_r({
2284       PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2285      });
2286     } else {
2287      DEBUG_TRIE_EXECUTE_r(
2288       PerlIO_printf( Perl_debug_log,"No match.\n"));
2289      break;
2290     }
2291    }
2292    FREETMPS;
2293    LEAVE;
2294   }
2295   break;
2296  default:
2297   Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2298  }
2299  return 0;
2300   got_it:
2301  return s;
2302 }
2303
2304 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2305  * flags have same meanings as with regexec_flags() */
2306
2307 static void
2308 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2309        char *strbeg,
2310        char *strend,
2311        SV *sv,
2312        U32 flags,
2313        bool utf8_target)
2314 {
2315  struct regexp *const prog = ReANY(rx);
2316
2317  if (flags & REXEC_COPY_STR) {
2318 #ifdef PERL_ANY_COW
2319   if (SvCANCOW(sv)) {
2320    if (DEBUG_C_TEST) {
2321     PerlIO_printf(Perl_debug_log,
2322        "Copy on write: regexp capture, type %d\n",
2323        (int) SvTYPE(sv));
2324    }
2325    /* Create a new COW SV to share the match string and store
2326    * in saved_copy, unless the current COW SV in saved_copy
2327    * is valid and suitable for our purpose */
2328    if ((   prog->saved_copy
2329     && SvIsCOW(prog->saved_copy)
2330     && SvPOKp(prog->saved_copy)
2331     && SvIsCOW(sv)
2332     && SvPOKp(sv)
2333     && SvPVX(sv) == SvPVX(prog->saved_copy)))
2334    {
2335     /* just reuse saved_copy SV */
2336     if (RXp_MATCH_COPIED(prog)) {
2337      Safefree(prog->subbeg);
2338      RXp_MATCH_COPIED_off(prog);
2339     }
2340    }
2341    else {
2342     /* create new COW SV to share string */
2343     RX_MATCH_COPY_FREE(rx);
2344     prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2345    }
2346    prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2347    assert (SvPOKp(prog->saved_copy));
2348    prog->sublen  = strend - strbeg;
2349    prog->suboffset = 0;
2350    prog->subcoffset = 0;
2351   } else
2352 #endif
2353   {
2354    SSize_t min = 0;
2355    SSize_t max = strend - strbeg;
2356    SSize_t sublen;
2357
2358    if (    (flags & REXEC_COPY_SKIP_POST)
2359     && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2360     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2361    ) { /* don't copy $' part of string */
2362     U32 n = 0;
2363     max = -1;
2364     /* calculate the right-most part of the string covered
2365     * by a capture. Due to look-ahead, this may be to
2366     * the right of $&, so we have to scan all captures */
2367     while (n <= prog->lastparen) {
2368      if (prog->offs[n].end > max)
2369       max = prog->offs[n].end;
2370      n++;
2371     }
2372     if (max == -1)
2373      max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2374        ? prog->offs[0].start
2375        : 0;
2376     assert(max >= 0 && max <= strend - strbeg);
2377    }
2378
2379    if (    (flags & REXEC_COPY_SKIP_PRE)
2380     && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2381     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2382    ) { /* don't copy $` part of string */
2383     U32 n = 0;
2384     min = max;
2385     /* calculate the left-most part of the string covered
2386     * by a capture. Due to look-behind, this may be to
2387     * the left of $&, so we have to scan all captures */
2388     while (min && n <= prog->lastparen) {
2389      if (   prog->offs[n].start != -1
2390       && prog->offs[n].start < min)
2391      {
2392       min = prog->offs[n].start;
2393      }
2394      n++;
2395     }
2396     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2397      && min >  prog->offs[0].end
2398     )
2399      min = prog->offs[0].end;
2400
2401    }
2402
2403    assert(min >= 0 && min <= max && min <= strend - strbeg);
2404    sublen = max - min;
2405
2406    if (RX_MATCH_COPIED(rx)) {
2407     if (sublen > prog->sublen)
2408      prog->subbeg =
2409        (char*)saferealloc(prog->subbeg, sublen+1);
2410    }
2411    else
2412     prog->subbeg = (char*)safemalloc(sublen+1);
2413    Copy(strbeg + min, prog->subbeg, sublen, char);
2414    prog->subbeg[sublen] = '\0';
2415    prog->suboffset = min;
2416    prog->sublen = sublen;
2417    RX_MATCH_COPIED_on(rx);
2418   }
2419   prog->subcoffset = prog->suboffset;
2420   if (prog->suboffset && utf8_target) {
2421    /* Convert byte offset to chars.
2422    * XXX ideally should only compute this if @-/@+
2423    * has been seen, a la PL_sawampersand ??? */
2424
2425    /* If there's a direct correspondence between the
2426    * string which we're matching and the original SV,
2427    * then we can use the utf8 len cache associated with
2428    * the SV. In particular, it means that under //g,
2429    * sv_pos_b2u() will use the previously cached
2430    * position to speed up working out the new length of
2431    * subcoffset, rather than counting from the start of
2432    * the string each time. This stops
2433    *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2434    * from going quadratic */
2435    if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2436     prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2437             SV_GMAGIC|SV_CONST_RETURN);
2438    else
2439     prog->subcoffset = utf8_length((U8*)strbeg,
2440          (U8*)(strbeg+prog->suboffset));
2441   }
2442  }
2443  else {
2444   RX_MATCH_COPY_FREE(rx);
2445   prog->subbeg = strbeg;
2446   prog->suboffset = 0;
2447   prog->subcoffset = 0;
2448   prog->sublen = strend - strbeg;
2449  }
2450 }
2451
2452
2453
2454
2455 /*
2456  - regexec_flags - match a regexp against a string
2457  */
2458 I32
2459 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2460    char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2461 /* stringarg: the point in the string at which to begin matching */
2462 /* strend:    pointer to null at end of string */
2463 /* strbeg:    real beginning of string */
2464 /* minend:    end of match must be >= minend bytes after stringarg. */
2465 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2466  *            itself is accessed via the pointers above */
2467 /* data:      May be used for some additional optimizations.
2468    Currently unused. */
2469 /* flags:     For optimizations. See REXEC_* in regexp.h */
2470
2471 {
2472  struct regexp *const prog = ReANY(rx);
2473  char *s;
2474  regnode *c;
2475  char *startpos;
2476  SSize_t minlen;  /* must match at least this many chars */
2477  SSize_t dontbother = 0; /* how many characters not to try at end */
2478  const bool utf8_target = cBOOL(DO_UTF8(sv));
2479  I32 multiline;
2480  RXi_GET_DECL(prog,progi);
2481  regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2482  regmatch_info *const reginfo = &reginfo_buf;
2483  regexp_paren_pair *swap = NULL;
2484  I32 oldsave;
2485  GET_RE_DEBUG_FLAGS_DECL;
2486
2487  PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2488  PERL_UNUSED_ARG(data);
2489
2490  /* Be paranoid... */
2491  if (prog == NULL || stringarg == NULL) {
2492   Perl_croak(aTHX_ "NULL regexp parameter");
2493  }
2494
2495  DEBUG_EXECUTE_r(
2496   debug_start_match(rx, utf8_target, stringarg, strend,
2497   "Matching");
2498  );
2499
2500  startpos = stringarg;
2501
2502  if (prog->intflags & PREGf_GPOS_SEEN) {
2503   MAGIC *mg;
2504
2505   /* set reginfo->ganch, the position where \G can match */
2506
2507   reginfo->ganch =
2508    (flags & REXEC_IGNOREPOS)
2509    ? stringarg /* use start pos rather than pos() */
2510    : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2511    /* Defined pos(): */
2512    ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2513    : strbeg; /* pos() not defined; use start of string */
2514
2515   DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2516    "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2517
2518   /* in the presence of \G, we may need to start looking earlier in
2519   * the string than the suggested start point of stringarg:
2520   * if prog->gofs is set, then that's a known, fixed minimum
2521   * offset, such as
2522   * /..\G/:   gofs = 2
2523   * /ab|c\G/: gofs = 1
2524   * or if the minimum offset isn't known, then we have to go back
2525   * to the start of the string, e.g. /w+\G/
2526   */
2527
2528   if (prog->intflags & PREGf_ANCH_GPOS) {
2529    startpos  = reginfo->ganch - prog->gofs;
2530    if (startpos <
2531     ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2532    {
2533     DEBUG_r(PerlIO_printf(Perl_debug_log,
2534       "fail: ganch-gofs before earliest possible start\n"));
2535     return 0;
2536    }
2537   }
2538   else if (prog->gofs) {
2539    if (startpos - prog->gofs < strbeg)
2540     startpos = strbeg;
2541    else
2542     startpos -= prog->gofs;
2543   }
2544   else if (prog->intflags & PREGf_GPOS_FLOAT)
2545    startpos = strbeg;
2546  }
2547
2548  minlen = prog->minlen;
2549  if ((startpos + minlen) > strend || startpos < strbeg) {
2550   DEBUG_r(PerlIO_printf(Perl_debug_log,
2551      "Regex match can't succeed, so not even tried\n"));
2552   return 0;
2553  }
2554
2555  /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2556  * which will call destuctors to reset PL_regmatch_state, free higher
2557  * PL_regmatch_slabs, and clean up regmatch_info_aux and
2558  * regmatch_info_aux_eval */
2559
2560  oldsave = PL_savestack_ix;
2561
2562  s = startpos;
2563
2564  if ((prog->extflags & RXf_USE_INTUIT)
2565   && !(flags & REXEC_CHECKED))
2566  {
2567   s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2568          flags, NULL);
2569   if (!s)
2570    return 0;
2571
2572   if (prog->extflags & RXf_CHECK_ALL) {
2573    /* we can match based purely on the result of INTUIT.
2574    * Set up captures etc just for $& and $-[0]
2575    * (an intuit-only match wont have $1,$2,..) */
2576    assert(!prog->nparens);
2577
2578    /* s/// doesn't like it if $& is earlier than where we asked it to
2579    * start searching (which can happen on something like /.\G/) */
2580    if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2581      && (s < stringarg))
2582    {
2583     /* this should only be possible under \G */
2584     assert(prog->intflags & PREGf_GPOS_SEEN);
2585     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2586      "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2587     goto phooey;
2588    }
2589
2590    /* match via INTUIT shouldn't have any captures.
2591    * Let @-, @+, $^N know */
2592    prog->lastparen = prog->lastcloseparen = 0;
2593    RX_MATCH_UTF8_set(rx, utf8_target);
2594    prog->offs[0].start = s - strbeg;
2595    prog->offs[0].end = utf8_target
2596     ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2597     : s - strbeg + prog->minlenret;
2598    if ( !(flags & REXEC_NOT_FIRST) )
2599     S_reg_set_capture_string(aTHX_ rx,
2600           strbeg, strend,
2601           sv, flags, utf8_target);
2602
2603    return 1;
2604   }
2605  }
2606
2607  multiline = prog->extflags & RXf_PMf_MULTILINE;
2608
2609  if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2610   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2611        "String too short [regexec_flags]...\n"));
2612   goto phooey;
2613  }
2614
2615  /* Check validity of program. */
2616  if (UCHARAT(progi->program) != REG_MAGIC) {
2617   Perl_croak(aTHX_ "corrupted regexp program");
2618  }
2619
2620  RX_MATCH_TAINTED_off(rx);
2621
2622  reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2623  reginfo->intuit = 0;
2624  reginfo->is_utf8_target = cBOOL(utf8_target);
2625  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2626  reginfo->warned = FALSE;
2627  reginfo->strbeg  = strbeg;
2628  reginfo->sv = sv;
2629  reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2630  reginfo->strend = strend;
2631  /* see how far we have to get to not match where we matched before */
2632  reginfo->till = stringarg + minend;
2633
2634  if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2635   /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2636   S_cleanup_regmatch_info_aux has executed (registered by
2637   SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2638   magic belonging to this SV.
2639   Not newSVsv, either, as it does not COW.
2640   */
2641   assert(!IS_PADGV(sv));
2642   reginfo->sv = newSV(0);
2643   SvSetSV_nosteal(reginfo->sv, sv);
2644   SAVEFREESV(reginfo->sv);
2645  }
2646
2647  /* reserve next 2 or 3 slots in PL_regmatch_state:
2648  * slot N+0: may currently be in use: skip it
2649  * slot N+1: use for regmatch_info_aux struct
2650  * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2651  * slot N+3: ready for use by regmatch()
2652  */
2653
2654  {
2655   regmatch_state *old_regmatch_state;
2656   regmatch_slab  *old_regmatch_slab;
2657   int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2658
2659   /* on first ever match, allocate first slab */
2660   if (!PL_regmatch_slab) {
2661    Newx(PL_regmatch_slab, 1, regmatch_slab);
2662    PL_regmatch_slab->prev = NULL;
2663    PL_regmatch_slab->next = NULL;
2664    PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2665   }
2666
2667   old_regmatch_state = PL_regmatch_state;
2668   old_regmatch_slab  = PL_regmatch_slab;
2669
2670   for (i=0; i <= max; i++) {
2671    if (i == 1)
2672     reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2673    else if (i ==2)
2674     reginfo->info_aux_eval =
2675     reginfo->info_aux->info_aux_eval =
2676        &(PL_regmatch_state->u.info_aux_eval);
2677
2678    if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2679     PL_regmatch_state = S_push_slab(aTHX);
2680   }
2681
2682   /* note initial PL_regmatch_state position; at end of match we'll
2683   * pop back to there and free any higher slabs */
2684
2685   reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2686   reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2687   reginfo->info_aux->poscache = NULL;
2688
2689   SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2690
2691   if ((prog->extflags & RXf_EVAL_SEEN))
2692    S_setup_eval_state(aTHX_ reginfo);
2693   else
2694    reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2695  }
2696
2697  /* If there is a "must appear" string, look for it. */
2698
2699  if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2700   /* We have to be careful. If the previous successful match
2701   was from this regex we don't want a subsequent partially
2702   successful match to clobber the old results.
2703   So when we detect this possibility we add a swap buffer
2704   to the re, and switch the buffer each match. If we fail,
2705   we switch it back; otherwise we leave it swapped.
2706   */
2707   swap = prog->offs;
2708   /* do we need a save destructor here for eval dies? */
2709   Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2710   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2711    "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2712    PTR2UV(prog),
2713    PTR2UV(swap),
2714    PTR2UV(prog->offs)
2715   ));
2716  }
2717
2718  /* Simplest case:  anchored match need be tried only once. */
2719  /*  [unless only anchor is BOL and multiline is set] */
2720  if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
2721   if (s == startpos && regtry(reginfo, &s))
2722    goto got_it;
2723   else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
2724   {
2725    char *end;
2726
2727    if (minlen)
2728     dontbother = minlen - 1;
2729    end = HOP3c(strend, -dontbother, strbeg) - 1;
2730    /* for multiline we only have to try after newlines */
2731    if (prog->check_substr || prog->check_utf8) {
2732     /* because of the goto we can not easily reuse the macros for bifurcating the
2733     unicode/non-unicode match modes here like we do elsewhere - demerphq */
2734     if (utf8_target) {
2735      if (s == startpos)
2736       goto after_try_utf8;
2737      while (1) {
2738       if (regtry(reginfo, &s)) {
2739        goto got_it;
2740       }
2741      after_try_utf8:
2742       if (s > end) {
2743        goto phooey;
2744       }
2745       if (prog->extflags & RXf_USE_INTUIT) {
2746        s = re_intuit_start(rx, sv, strbeg,
2747          s + UTF8SKIP(s), strend, flags, NULL);
2748        if (!s) {
2749         goto phooey;
2750        }
2751       }
2752       else {
2753        s += UTF8SKIP(s);
2754       }
2755      }
2756     } /* end search for check string in unicode */
2757     else {
2758      if (s == startpos) {
2759       goto after_try_latin;
2760      }
2761      while (1) {
2762       if (regtry(reginfo, &s)) {
2763        goto got_it;
2764       }
2765      after_try_latin:
2766       if (s > end) {
2767        goto phooey;
2768       }
2769       if (prog->extflags & RXf_USE_INTUIT) {
2770        s = re_intuit_start(rx, sv, strbeg,
2771           s + 1, strend, flags, NULL);
2772        if (!s) {
2773         goto phooey;
2774        }
2775       }
2776       else {
2777        s++;
2778       }
2779      }
2780     } /* end search for check string in latin*/
2781    } /* end search for check string */
2782    else { /* search for newline */
2783     if (s > startpos) {
2784      /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2785      s--;
2786     }
2787     /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2788     while (s <= end) { /* note it could be possible to match at the end of the string */
2789      if (*s++ == '\n') { /* don't need PL_utf8skip here */
2790       if (regtry(reginfo, &s))
2791        goto got_it;
2792      }
2793     }
2794    } /* end search for newline */
2795   } /* end anchored/multiline check string search */
2796   goto phooey;
2797  } else if (prog->intflags & PREGf_ANCH_GPOS)
2798  {
2799   /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2800   assert(prog->intflags & PREGf_GPOS_SEEN);
2801   /* For anchored \G, the only position it can match from is
2802   * (ganch-gofs); we already set startpos to this above; if intuit
2803   * moved us on from there, we can't possibly succeed */
2804   assert(startpos == reginfo->ganch - prog->gofs);
2805   if (s == startpos && regtry(reginfo, &s))
2806    goto got_it;
2807   goto phooey;
2808  }
2809
2810  /* Messy cases:  unanchored match. */
2811  if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2812   /* we have /x+whatever/ */
2813   /* it must be a one character string (XXXX Except is_utf8_pat?) */
2814   char ch;
2815 #ifdef DEBUGGING
2816   int did_match = 0;
2817 #endif
2818   if (utf8_target) {
2819    if (! prog->anchored_utf8) {
2820     to_utf8_substr(prog);
2821    }
2822    ch = SvPVX_const(prog->anchored_utf8)[0];
2823    REXEC_FBC_SCAN(
2824     if (*s == ch) {
2825      DEBUG_EXECUTE_r( did_match = 1 );
2826      if (regtry(reginfo, &s)) goto got_it;
2827      s += UTF8SKIP(s);
2828      while (s < strend && *s == ch)
2829       s += UTF8SKIP(s);
2830     }
2831    );
2832
2833   }
2834   else {
2835    if (! prog->anchored_substr) {
2836     if (! to_byte_substr(prog)) {
2837      NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2838     }
2839    }
2840    ch = SvPVX_const(prog->anchored_substr)[0];
2841    REXEC_FBC_SCAN(
2842     if (*s == ch) {
2843      DEBUG_EXECUTE_r( did_match = 1 );
2844      if (regtry(reginfo, &s)) goto got_it;
2845      s++;
2846      while (s < strend && *s == ch)
2847       s++;
2848     }
2849    );
2850   }
2851   DEBUG_EXECUTE_r(if (!did_match)
2852     PerlIO_printf(Perl_debug_log,
2853         "Did not find anchored character...\n")
2854    );
2855  }
2856  else if (prog->anchored_substr != NULL
2857    || prog->anchored_utf8 != NULL
2858    || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2859     && prog->float_max_offset < strend - s)) {
2860   SV *must;
2861   SSize_t back_max;
2862   SSize_t back_min;
2863   char *last;
2864   char *last1;  /* Last position checked before */
2865 #ifdef DEBUGGING
2866   int did_match = 0;
2867 #endif
2868   if (prog->anchored_substr || prog->anchored_utf8) {
2869    if (utf8_target) {
2870     if (! prog->anchored_utf8) {
2871      to_utf8_substr(prog);
2872     }
2873     must = prog->anchored_utf8;
2874    }
2875    else {
2876     if (! prog->anchored_substr) {
2877      if (! to_byte_substr(prog)) {
2878       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2879      }
2880     }
2881     must = prog->anchored_substr;
2882    }
2883    back_max = back_min = prog->anchored_offset;
2884   } else {
2885    if (utf8_target) {
2886     if (! prog->float_utf8) {
2887      to_utf8_substr(prog);
2888     }
2889     must = prog->float_utf8;
2890    }
2891    else {
2892     if (! prog->float_substr) {
2893      if (! to_byte_substr(prog)) {
2894       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2895      }
2896     }
2897     must = prog->float_substr;
2898    }
2899    back_max = prog->float_max_offset;
2900    back_min = prog->float_min_offset;
2901   }
2902
2903   if (back_min<0) {
2904    last = strend;
2905   } else {
2906    last = HOP3c(strend, /* Cannot start after this */
2907     -(SSize_t)(CHR_SVLEN(must)
2908       - (SvTAIL(must) != 0) + back_min), strbeg);
2909   }
2910   if (s > reginfo->strbeg)
2911    last1 = HOPc(s, -1);
2912   else
2913    last1 = s - 1; /* bogus */
2914
2915   /* XXXX check_substr already used to find "s", can optimize if
2916   check_substr==must. */
2917   dontbother = 0;
2918   strend = HOPc(strend, -dontbother);
2919   while ( (s <= last) &&
2920     (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
2921         (unsigned char*)strend, must,
2922         multiline ? FBMrf_MULTILINE : 0)) ) {
2923    DEBUG_EXECUTE_r( did_match = 1 );
2924    if (HOPc(s, -back_max) > last1) {
2925     last1 = HOPc(s, -back_min);
2926     s = HOPc(s, -back_max);
2927    }
2928    else {
2929     char * const t = (last1 >= reginfo->strbeg)
2930          ? HOPc(last1, 1) : last1 + 1;
2931
2932     last1 = HOPc(s, -back_min);
2933     s = t;
2934    }
2935    if (utf8_target) {
2936     while (s <= last1) {
2937      if (regtry(reginfo, &s))
2938       goto got_it;
2939      if (s >= last1) {
2940       s++; /* to break out of outer loop */
2941       break;
2942      }
2943      s += UTF8SKIP(s);
2944     }
2945    }
2946    else {
2947     while (s <= last1) {
2948      if (regtry(reginfo, &s))
2949       goto got_it;
2950      s++;
2951     }
2952    }
2953   }
2954   DEBUG_EXECUTE_r(if (!did_match) {
2955    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2956     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2957    PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2958        ((must == prog->anchored_substr || must == prog->anchored_utf8)
2959        ? "anchored" : "floating"),
2960     quoted, RE_SV_TAIL(must));
2961   });
2962   goto phooey;
2963  }
2964  else if ( (c = progi->regstclass) ) {
2965   if (minlen) {
2966    const OPCODE op = OP(progi->regstclass);
2967    /* don't bother with what can't match */
2968    if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2969     strend = HOPc(strend, -(minlen - 1));
2970   }
2971   DEBUG_EXECUTE_r({
2972    SV * const prop = sv_newmortal();
2973    regprop(prog, prop, c, reginfo);
2974    {
2975     RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2976      s,strend-s,60);
2977     PerlIO_printf(Perl_debug_log,
2978      "Matching stclass %.*s against %s (%d bytes)\n",
2979      (int)SvCUR(prop), SvPVX_const(prop),
2980      quoted, (int)(strend - s));
2981    }
2982   });
2983   if (find_byclass(prog, c, s, strend, reginfo))
2984    goto got_it;
2985   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2986  }
2987  else {
2988   dontbother = 0;
2989   if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2990    /* Trim the end. */
2991    char *last= NULL;
2992    SV* float_real;
2993    STRLEN len;
2994    const char *little;
2995
2996    if (utf8_target) {
2997     if (! prog->float_utf8) {
2998      to_utf8_substr(prog);
2999     }
3000     float_real = prog->float_utf8;
3001    }
3002    else {
3003     if (! prog->float_substr) {
3004      if (! to_byte_substr(prog)) {
3005       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3006      }
3007     }
3008     float_real = prog->float_substr;
3009    }
3010
3011    little = SvPV_const(float_real, len);
3012    if (SvTAIL(float_real)) {
3013      /* This means that float_real contains an artificial \n on
3014      * the end due to the presence of something like this:
3015      * /foo$/ where we can match both "foo" and "foo\n" at the
3016      * end of the string.  So we have to compare the end of the
3017      * string first against the float_real without the \n and
3018      * then against the full float_real with the string.  We
3019      * have to watch out for cases where the string might be
3020      * smaller than the float_real or the float_real without
3021      * the \n. */
3022      char *checkpos= strend - len;
3023      DEBUG_OPTIMISE_r(
3024       PerlIO_printf(Perl_debug_log,
3025        "%sChecking for float_real.%s\n",
3026        PL_colors[4], PL_colors[5]));
3027      if (checkpos + 1 < strbeg) {
3028       /* can't match, even if we remove the trailing \n
3029       * string is too short to match */
3030       DEBUG_EXECUTE_r(
3031        PerlIO_printf(Perl_debug_log,
3032         "%sString shorter than required trailing substring, cannot match.%s\n",
3033         PL_colors[4], PL_colors[5]));
3034       goto phooey;
3035      } else if (memEQ(checkpos + 1, little, len - 1)) {
3036       /* can match, the end of the string matches without the
3037       * "\n" */
3038       last = checkpos + 1;
3039      } else if (checkpos < strbeg) {
3040       /* cant match, string is too short when the "\n" is
3041       * included */
3042       DEBUG_EXECUTE_r(
3043        PerlIO_printf(Perl_debug_log,
3044         "%sString does not contain required trailing substring, cannot match.%s\n",
3045         PL_colors[4], PL_colors[5]));
3046       goto phooey;
3047      } else if (!multiline) {
3048       /* non multiline match, so compare with the "\n" at the
3049       * end of the string */
3050       if (memEQ(checkpos, little, len)) {
3051        last= checkpos;
3052       } else {
3053        DEBUG_EXECUTE_r(
3054         PerlIO_printf(Perl_debug_log,
3055          "%sString does not contain required trailing substring, cannot match.%s\n",
3056          PL_colors[4], PL_colors[5]));
3057        goto phooey;
3058       }
3059      } else {
3060       /* multiline match, so we have to search for a place
3061       * where the full string is located */
3062       goto find_last;
3063      }
3064    } else {
3065     find_last:
3066      if (len)
3067       last = rninstr(s, strend, little, little + len);
3068      else
3069       last = strend; /* matching "$" */
3070    }
3071    if (!last) {
3072     /* at one point this block contained a comment which was
3073     * probably incorrect, which said that this was a "should not
3074     * happen" case.  Even if it was true when it was written I am
3075     * pretty sure it is not anymore, so I have removed the comment
3076     * and replaced it with this one. Yves */
3077     DEBUG_EXECUTE_r(
3078      PerlIO_printf(Perl_debug_log,
3079       "String does not contain required substring, cannot match.\n"
3080      ));
3081     goto phooey;
3082    }
3083    dontbother = strend - last + prog->float_min_offset;
3084   }
3085   if (minlen && (dontbother < minlen))
3086    dontbother = minlen - 1;
3087   strend -= dontbother;      /* this one's always in bytes! */
3088   /* We don't know much -- general case. */
3089   if (utf8_target) {
3090    for (;;) {
3091     if (regtry(reginfo, &s))
3092      goto got_it;
3093     if (s >= strend)
3094      break;
3095     s += UTF8SKIP(s);
3096    };
3097   }
3098   else {
3099    do {
3100     if (regtry(reginfo, &s))
3101      goto got_it;
3102    } while (s++ < strend);
3103   }
3104  }
3105
3106  /* Failure. */
3107  goto phooey;
3108
3109 got_it:
3110  /* s/// doesn't like it if $& is earlier than where we asked it to
3111  * start searching (which can happen on something like /.\G/) */
3112  if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3113    && (prog->offs[0].start < stringarg - strbeg))
3114  {
3115   /* this should only be possible under \G */
3116   assert(prog->intflags & PREGf_GPOS_SEEN);
3117   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3118    "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3119   goto phooey;
3120  }
3121
3122  DEBUG_BUFFERS_r(
3123   if (swap)
3124    PerlIO_printf(Perl_debug_log,
3125     "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3126     PTR2UV(prog),
3127     PTR2UV(swap)
3128    );
3129  );
3130  Safefree(swap);
3131
3132  /* clean up; this will trigger destructors that will free all slabs
3133  * above the current one, and cleanup the regmatch_info_aux
3134  * and regmatch_info_aux_eval sructs */
3135
3136  LEAVE_SCOPE(oldsave);
3137
3138  if (RXp_PAREN_NAMES(prog))
3139   (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3140
3141  RX_MATCH_UTF8_set(rx, utf8_target);
3142
3143  /* make sure $`, $&, $', and $digit will work later */
3144  if ( !(flags & REXEC_NOT_FIRST) )
3145   S_reg_set_capture_string(aTHX_ rx,
3146          strbeg, reginfo->strend,
3147          sv, flags, utf8_target);
3148
3149  return 1;
3150
3151 phooey:
3152  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3153       PL_colors[4], PL_colors[5]));
3154
3155  /* clean up; this will trigger destructors that will free all slabs
3156  * above the current one, and cleanup the regmatch_info_aux
3157  * and regmatch_info_aux_eval sructs */
3158
3159  LEAVE_SCOPE(oldsave);
3160
3161  if (swap) {
3162   /* we failed :-( roll it back */
3163   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3164    "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3165    PTR2UV(prog),
3166    PTR2UV(prog->offs),
3167    PTR2UV(swap)
3168   ));
3169   Safefree(prog->offs);
3170   prog->offs = swap;
3171  }
3172  return 0;
3173 }
3174
3175
3176 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3177  * Do inc before dec, in case old and new rex are the same */
3178 #define SET_reg_curpm(Re2)                          \
3179  if (reginfo->info_aux_eval) {                   \
3180   (void)ReREFCNT_inc(Re2);      \
3181   ReREFCNT_dec(PM_GETRE(PL_reg_curpm));     \
3182   PM_SETRE((PL_reg_curpm), (Re2));     \
3183  }
3184
3185
3186 /*
3187  - regtry - try match at specific point
3188  */
3189 STATIC I32   /* 0 failure, 1 success */
3190 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3191 {
3192  CHECKPOINT lastcp;
3193  REGEXP *const rx = reginfo->prog;
3194  regexp *const prog = ReANY(rx);
3195  SSize_t result;
3196  RXi_GET_DECL(prog,progi);
3197  GET_RE_DEBUG_FLAGS_DECL;
3198
3199  PERL_ARGS_ASSERT_REGTRY;
3200
3201  reginfo->cutpoint=NULL;
3202
3203  prog->offs[0].start = *startposp - reginfo->strbeg;
3204  prog->lastparen = 0;
3205  prog->lastcloseparen = 0;
3206
3207  /* XXXX What this code is doing here?!!!  There should be no need
3208  to do this again and again, prog->lastparen should take care of
3209  this!  --ilya*/
3210
3211  /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3212  * Actually, the code in regcppop() (which Ilya may be meaning by
3213  * prog->lastparen), is not needed at all by the test suite
3214  * (op/regexp, op/pat, op/split), but that code is needed otherwise
3215  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3216  * Meanwhile, this code *is* needed for the
3217  * above-mentioned test suite tests to succeed.  The common theme
3218  * on those tests seems to be returning null fields from matches.
3219  * --jhi updated by dapm */
3220 #if 1
3221  if (prog->nparens) {
3222   regexp_paren_pair *pp = prog->offs;
3223   I32 i;
3224   for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3225    ++pp;
3226    pp->start = -1;
3227    pp->end = -1;
3228   }
3229  }
3230 #endif
3231  REGCP_SET(lastcp);
3232  result = regmatch(reginfo, *startposp, progi->program + 1);
3233  if (result != -1) {
3234   prog->offs[0].end = result;
3235   return 1;
3236  }
3237  if (reginfo->cutpoint)
3238   *startposp= reginfo->cutpoint;
3239  REGCP_UNWIND(lastcp);
3240  return 0;
3241 }
3242
3243
3244 #define sayYES goto yes
3245 #define sayNO goto no
3246 #define sayNO_SILENT goto no_silent
3247
3248 /* we dont use STMT_START/END here because it leads to
3249    "unreachable code" warnings, which are bogus, but distracting. */
3250 #define CACHEsayNO \
3251  if (ST.cache_mask) \
3252  reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3253  sayNO
3254
3255 /* this is used to determine how far from the left messages like
3256    'failed...' are printed. It should be set such that messages
3257    are inline with the regop output that created them.
3258 */
3259 #define REPORT_CODE_OFF 32
3260
3261
3262 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3263 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3264 #define CHRTEST_NOT_A_CP_1 -999
3265 #define CHRTEST_NOT_A_CP_2 -998
3266
3267 /* grab a new slab and return the first slot in it */
3268
3269 STATIC regmatch_state *
3270 S_push_slab(pTHX)
3271 {
3272 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3273  dMY_CXT;
3274 #endif
3275  regmatch_slab *s = PL_regmatch_slab->next;
3276  if (!s) {
3277   Newx(s, 1, regmatch_slab);
3278   s->prev = PL_regmatch_slab;
3279   s->next = NULL;
3280   PL_regmatch_slab->next = s;
3281  }
3282  PL_regmatch_slab = s;
3283  return SLAB_FIRST(s);
3284 }
3285
3286
3287 /* push a new state then goto it */
3288
3289 #define PUSH_STATE_GOTO(state, node, input) \
3290  pushinput = input; \
3291  scan = node; \
3292  st->resume_state = state; \
3293  goto push_state;
3294
3295 /* push a new state with success backtracking, then goto it */
3296
3297 #define PUSH_YES_STATE_GOTO(state, node, input) \
3298  pushinput = input; \
3299  scan = node; \
3300  st->resume_state = state; \
3301  goto push_yes_state;
3302
3303
3304
3305
3306 /*
3307
3308 regmatch() - main matching routine
3309
3310 This is basically one big switch statement in a loop. We execute an op,
3311 set 'next' to point the next op, and continue. If we come to a point which
3312 we may need to backtrack to on failure such as (A|B|C), we push a
3313 backtrack state onto the backtrack stack. On failure, we pop the top
3314 state, and re-enter the loop at the state indicated. If there are no more
3315 states to pop, we return failure.
3316
3317 Sometimes we also need to backtrack on success; for example /A+/, where
3318 after successfully matching one A, we need to go back and try to
3319 match another one; similarly for lookahead assertions: if the assertion
3320 completes successfully, we backtrack to the state just before the assertion
3321 and then carry on.  In these cases, the pushed state is marked as
3322 'backtrack on success too'. This marking is in fact done by a chain of
3323 pointers, each pointing to the previous 'yes' state. On success, we pop to
3324 the nearest yes state, discarding any intermediate failure-only states.
3325 Sometimes a yes state is pushed just to force some cleanup code to be
3326 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3327 it to free the inner regex.
3328
3329 Note that failure backtracking rewinds the cursor position, while
3330 success backtracking leaves it alone.
3331
3332 A pattern is complete when the END op is executed, while a subpattern
3333 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3334 ops trigger the "pop to last yes state if any, otherwise return true"
3335 behaviour.
3336
3337 A common convention in this function is to use A and B to refer to the two
3338 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3339 the subpattern to be matched possibly multiple times, while B is the entire
3340 rest of the pattern. Variable and state names reflect this convention.
3341
3342 The states in the main switch are the union of ops and failure/success of
3343 substates associated with with that op.  For example, IFMATCH is the op
3344 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3345 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3346 successfully matched A and IFMATCH_A_fail is a state saying that we have
3347 just failed to match A. Resume states always come in pairs. The backtrack
3348 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3349 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3350 on success or failure.
3351
3352 The struct that holds a backtracking state is actually a big union, with
3353 one variant for each major type of op. The variable st points to the
3354 top-most backtrack struct. To make the code clearer, within each
3355 block of code we #define ST to alias the relevant union.
3356
3357 Here's a concrete example of a (vastly oversimplified) IFMATCH
3358 implementation:
3359
3360  switch (state) {
3361  ....
3362
3363 #define ST st->u.ifmatch
3364
3365  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3366   ST.foo = ...; // some state we wish to save
3367   ...
3368   // push a yes backtrack state with a resume value of
3369   // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3370   // first node of A:
3371   PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3372   // NOTREACHED
3373
3374  case IFMATCH_A: // we have successfully executed A; now continue with B
3375   next = B;
3376   bar = ST.foo; // do something with the preserved value
3377   break;
3378
3379  case IFMATCH_A_fail: // A failed, so the assertion failed
3380   ...;   // do some housekeeping, then ...
3381   sayNO; // propagate the failure
3382
3383 #undef ST
3384
3385  ...
3386  }
3387
3388 For any old-timers reading this who are familiar with the old recursive
3389 approach, the code above is equivalent to:
3390
3391  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3392  {
3393   int foo = ...
3394   ...
3395   if (regmatch(A)) {
3396    next = B;
3397    bar = foo;
3398    break;
3399   }
3400   ...;   // do some housekeeping, then ...
3401   sayNO; // propagate the failure
3402  }
3403
3404 The topmost backtrack state, pointed to by st, is usually free. If you
3405 want to claim it, populate any ST.foo fields in it with values you wish to
3406 save, then do one of
3407
3408   PUSH_STATE_GOTO(resume_state, node, newinput);
3409   PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3410
3411 which sets that backtrack state's resume value to 'resume_state', pushes a
3412 new free entry to the top of the backtrack stack, then goes to 'node'.
3413 On backtracking, the free slot is popped, and the saved state becomes the
3414 new free state. An ST.foo field in this new top state can be temporarily
3415 accessed to retrieve values, but once the main loop is re-entered, it
3416 becomes available for reuse.
3417
3418 Note that the depth of the backtrack stack constantly increases during the
3419 left-to-right execution of the pattern, rather than going up and down with
3420 the pattern nesting. For example the stack is at its maximum at Z at the
3421 end of the pattern, rather than at X in the following:
3422
3423  /(((X)+)+)+....(Y)+....Z/
3424
3425 The only exceptions to this are lookahead/behind assertions and the cut,
3426 (?>A), which pop all the backtrack states associated with A before
3427 continuing.
3428
3429 Backtrack state structs are allocated in slabs of about 4K in size.
3430 PL_regmatch_state and st always point to the currently active state,
3431 and PL_regmatch_slab points to the slab currently containing
3432 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3433 allocated, and is never freed until interpreter destruction. When the slab
3434 is full, a new one is allocated and chained to the end. At exit from
3435 regmatch(), slabs allocated since entry are freed.
3436
3437 */
3438
3439
3440 #define DEBUG_STATE_pp(pp)        \
3441  DEBUG_STATE_r({         \
3442   DUMP_EXEC_POS(locinput, scan, utf8_target);         \
3443   PerlIO_printf(Perl_debug_log,       \
3444    "    %*s"pp" %s%s%s%s%s\n",       \
3445    depth*2, "",        \
3446    PL_reg_name[st->resume_state],                  \
3447    ((st==yes_state||st==mark_state) ? "[" : ""),   \
3448    ((st==yes_state) ? "Y" : ""),                   \
3449    ((st==mark_state) ? "M" : ""),                  \
3450    ((st==yes_state||st==mark_state) ? "]" : "")    \
3451   );                                                  \
3452  });
3453
3454
3455 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3456
3457 #ifdef DEBUGGING
3458
3459 STATIC void
3460 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3461  const char *start, const char *end, const char *blurb)
3462 {
3463  const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3464
3465  PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3466
3467  if (!PL_colorset)
3468    reginitcolors();
3469  {
3470   RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3471    RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3472
3473   RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3474    start, end - start, 60);
3475
3476   PerlIO_printf(Perl_debug_log,
3477    "%s%s REx%s %s against %s\n",
3478      PL_colors[4], blurb, PL_colors[5], s0, s1);
3479
3480   if (utf8_target||utf8_pat)
3481    PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3482     utf8_pat ? "pattern" : "",
3483     utf8_pat && utf8_target ? " and " : "",
3484     utf8_target ? "string" : ""
3485    );
3486  }
3487 }
3488
3489 STATIC void
3490 S_dump_exec_pos(pTHX_ const char *locinput,
3491      const regnode *scan,
3492      const char *loc_regeol,
3493      const char *loc_bostr,
3494      const char *loc_reg_starttry,
3495      const bool utf8_target)
3496 {
3497  const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3498  const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3499  int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3500  /* The part of the string before starttry has one color
3501  (pref0_len chars), between starttry and current
3502  position another one (pref_len - pref0_len chars),
3503  after the current position the third one.
3504  We assume that pref0_len <= pref_len, otherwise we
3505  decrease pref0_len.  */
3506  int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3507   ? (5 + taill) - l : locinput - loc_bostr;
3508  int pref0_len;
3509
3510  PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3511
3512  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3513   pref_len++;
3514  pref0_len = pref_len  - (locinput - loc_reg_starttry);
3515  if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3516   l = ( loc_regeol - locinput > (5 + taill) - pref_len
3517    ? (5 + taill) - pref_len : loc_regeol - locinput);
3518  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3519   l--;
3520  if (pref0_len < 0)
3521   pref0_len = 0;
3522  if (pref0_len > pref_len)
3523   pref0_len = pref_len;
3524  {
3525   const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3526
3527   RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3528    (locinput - pref_len),pref0_len, 60, 4, 5);
3529
3530   RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3531      (locinput - pref_len + pref0_len),
3532      pref_len - pref0_len, 60, 2, 3);
3533
3534   RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3535      locinput, loc_regeol - locinput, 10, 0, 1);
3536
3537   const STRLEN tlen=len0+len1+len2;
3538   PerlIO_printf(Perl_debug_log,
3539      "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3540      (IV)(locinput - loc_bostr),
3541      len0, s0,
3542      len1, s1,
3543      (docolor ? "" : "> <"),
3544      len2, s2,
3545      (int)(tlen > 19 ? 0 :  19 - tlen),
3546      "");
3547  }
3548 }
3549
3550 #endif
3551
3552 /* reg_check_named_buff_matched()
3553  * Checks to see if a named buffer has matched. The data array of
3554  * buffer numbers corresponding to the buffer is expected to reside
3555  * in the regexp->data->data array in the slot stored in the ARG() of
3556  * node involved. Note that this routine doesn't actually care about the
3557  * name, that information is not preserved from compilation to execution.
3558  * Returns the index of the leftmost defined buffer with the given name
3559  * or 0 if non of the buffers matched.
3560  */
3561 STATIC I32
3562 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3563 {
3564  I32 n;
3565  RXi_GET_DECL(rex,rexi);
3566  SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3567  I32 *nums=(I32*)SvPVX(sv_dat);
3568
3569  PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3570
3571  for ( n=0; n<SvIVX(sv_dat); n++ ) {
3572   if ((I32)rex->lastparen >= nums[n] &&
3573    rex->offs[nums[n]].end != -1)
3574   {
3575    return nums[n];
3576   }
3577  }
3578  return 0;
3579 }
3580
3581
3582 static bool
3583 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3584   U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3585 {
3586  /* This function determines if there are one or two characters that match
3587  * the first character of the passed-in EXACTish node <text_node>, and if
3588  * so, returns them in the passed-in pointers.
3589  *
3590  * If it determines that no possible character in the target string can
3591  * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3592  * the first character in <text_node> requires UTF-8 to represent, and the
3593  * target string isn't in UTF-8.)
3594  *
3595  * If there are more than two characters that could match the beginning of
3596  * <text_node>, or if more context is required to determine a match or not,
3597  * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3598  *
3599  * The motiviation behind this function is to allow the caller to set up
3600  * tight loops for matching.  If <text_node> is of type EXACT, there is
3601  * only one possible character that can match its first character, and so
3602  * the situation is quite simple.  But things get much more complicated if
3603  * folding is involved.  It may be that the first character of an EXACTFish
3604  * node doesn't participate in any possible fold, e.g., punctuation, so it
3605  * can be matched only by itself.  The vast majority of characters that are
3606  * in folds match just two things, their lower and upper-case equivalents.
3607  * But not all are like that; some have multiple possible matches, or match
3608  * sequences of more than one character.  This function sorts all that out.
3609  *
3610  * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3611  * loop of trying to match A*, we know we can't exit where the thing
3612  * following it isn't a B.  And something can't be a B unless it is the
3613  * beginning of B.  By putting a quick test for that beginning in a tight
3614  * loop, we can rule out things that can't possibly be B without having to
3615  * break out of the loop, thus avoiding work.  Similarly, if A is a single
3616  * character, we can make a tight loop matching A*, using the outputs of
3617  * this function.
3618  *
3619  * If the target string to match isn't in UTF-8, and there aren't
3620  * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3621  * the one or two possible octets (which are characters in this situation)
3622  * that can match.  In all cases, if there is only one character that can
3623  * match, *<c1p> and *<c2p> will be identical.
3624  *
3625  * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3626  * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3627  * can match the beginning of <text_node>.  They should be declared with at
3628  * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3629  * undefined what these contain.)  If one or both of the buffers are
3630  * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3631  * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3632  * *<c2p> will be set to a negative number(s) that shouldn't match any code
3633  * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3634  * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3635
3636  const bool utf8_target = reginfo->is_utf8_target;
3637
3638  UV c1 = (UV)CHRTEST_NOT_A_CP_1;
3639  UV c2 = (UV)CHRTEST_NOT_A_CP_2;
3640  bool use_chrtest_void = FALSE;
3641  const bool is_utf8_pat = reginfo->is_utf8_pat;
3642
3643  /* Used when we have both utf8 input and utf8 output, to avoid converting
3644  * to/from code points */
3645  bool utf8_has_been_setup = FALSE;
3646
3647  dVAR;
3648
3649  U8 *pat = (U8*)STRING(text_node);
3650  U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3651
3652  if (OP(text_node) == EXACT) {
3653
3654   /* In an exact node, only one thing can be matched, that first
3655   * character.  If both the pat and the target are UTF-8, we can just
3656   * copy the input to the output, avoiding finding the code point of
3657   * that character */
3658   if (!is_utf8_pat) {
3659    c2 = c1 = *pat;
3660   }
3661   else if (utf8_target) {
3662    Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3663    Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3664    utf8_has_been_setup = TRUE;
3665   }
3666   else {
3667    c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3668   }
3669  }
3670  else { /* an EXACTFish node */
3671   U8 *pat_end = pat + STR_LEN(text_node);
3672
3673   /* An EXACTFL node has at least some characters unfolded, because what
3674   * they match is not known until now.  So, now is the time to fold
3675   * the first few of them, as many as are needed to determine 'c1' and
3676   * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
3677   * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3678   * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
3679   * need to fold as many characters as a single character can fold to,
3680   * so that later we can check if the first ones are such a multi-char
3681   * fold.  But, in such a pattern only locale-problematic characters
3682   * aren't folded, so we can skip this completely if the first character
3683   * in the node isn't one of the tricky ones */
3684   if (OP(text_node) == EXACTFL) {
3685
3686    if (! is_utf8_pat) {
3687     if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3688     {
3689      folded[0] = folded[1] = 's';
3690      pat = folded;
3691      pat_end = folded + 2;
3692     }
3693    }
3694    else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3695     U8 *s = pat;
3696     U8 *d = folded;
3697     int i;
3698
3699     for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3700      if (isASCII(*s)) {
3701       *(d++) = (U8) toFOLD_LC(*s);
3702       s++;
3703      }
3704      else {
3705       STRLEN len;
3706       _to_utf8_fold_flags(s,
3707            d,
3708            &len,
3709            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3710       d += len;
3711       s += UTF8SKIP(s);
3712      }
3713     }
3714
3715     pat = folded;
3716     pat_end = d;
3717    }
3718   }
3719
3720   if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
3721    || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
3722   {
3723    /* Multi-character folds require more context to sort out.  Also
3724    * PL_utf8_foldclosures used below doesn't handle them, so have to
3725    * be handled outside this routine */
3726    use_chrtest_void = TRUE;
3727   }
3728   else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3729    c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3730    if (c1 > 255) {
3731     /* Load the folds hash, if not already done */
3732     SV** listp;
3733     if (! PL_utf8_foldclosures) {
3734      _load_PL_utf8_foldclosures();
3735     }
3736
3737     /* The fold closures data structure is a hash with the keys
3738     * being the UTF-8 of every character that is folded to, like
3739     * 'k', and the values each an array of all code points that
3740     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
3741     * Multi-character folds are not included */
3742     if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3743           (char *) pat,
3744           UTF8SKIP(pat),
3745           FALSE))))
3746     {
3747      /* Not found in the hash, therefore there are no folds
3748      * containing it, so there is only a single character that
3749      * could match */
3750      c2 = c1;
3751     }
3752     else {  /* Does participate in folds */
3753      AV* list = (AV*) *listp;
3754      if (av_tindex(list) != 1) {
3755
3756       /* If there aren't exactly two folds to this, it is
3757       * outside the scope of this function */
3758       use_chrtest_void = TRUE;
3759      }
3760      else {  /* There are two.  Get them */
3761       SV** c_p = av_fetch(list, 0, FALSE);
3762       if (c_p == NULL) {
3763        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3764       }
3765       c1 = SvUV(*c_p);
3766
3767       c_p = av_fetch(list, 1, FALSE);
3768       if (c_p == NULL) {
3769        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3770       }
3771       c2 = SvUV(*c_p);
3772
3773       /* Folds that cross the 255/256 boundary are forbidden
3774       * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
3775       * one is ASCIII.  Since the pattern character is above
3776       * 255, and its only other match is below 256, the only
3777       * legal match will be to itself.  We have thrown away
3778       * the original, so have to compute which is the one
3779       * above 255. */
3780       if ((c1 < 256) != (c2 < 256)) {
3781        if ((OP(text_node) == EXACTFL
3782         && ! IN_UTF8_CTYPE_LOCALE)
3783         || ((OP(text_node) == EXACTFA
3784          || OP(text_node) == EXACTFA_NO_TRIE)
3785          && (isASCII(c1) || isASCII(c2))))
3786        {
3787         if (c1 < 256) {
3788          c1 = c2;
3789         }
3790         else {
3791          c2 = c1;
3792         }
3793        }
3794       }
3795      }
3796     }
3797    }
3798    else /* Here, c1 is <= 255 */
3799     if (utf8_target
3800      && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3801      && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
3802      && ((OP(text_node) != EXACTFA
3803       && OP(text_node) != EXACTFA_NO_TRIE)
3804       || ! isASCII(c1)))
3805    {
3806     /* Here, there could be something above Latin1 in the target
3807     * which folds to this character in the pattern.  All such
3808     * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
3809     * than two characters involved in their folds, so are outside
3810     * the scope of this function */
3811     if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3812      c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3813     }
3814     else {
3815      use_chrtest_void = TRUE;
3816     }
3817    }
3818    else { /* Here nothing above Latin1 can fold to the pattern
3819      character */
3820     switch (OP(text_node)) {
3821
3822      case EXACTFL:   /* /l rules */
3823       c2 = PL_fold_locale[c1];
3824       break;
3825
3826      case EXACTF:   /* This node only generated for non-utf8
3827          patterns */
3828       assert(! is_utf8_pat);
3829       if (! utf8_target) {    /* /d rules */
3830        c2 = PL_fold[c1];
3831        break;
3832       }
3833       /* FALLTHROUGH */
3834       /* /u rules for all these.  This happens to work for
3835       * EXACTFA as nothing in Latin1 folds to ASCII */
3836      case EXACTFA_NO_TRIE:   /* This node only generated for
3837            non-utf8 patterns */
3838       assert(! is_utf8_pat);
3839       /* FALLTHROUGH */
3840      case EXACTFA:
3841      case EXACTFU_SS:
3842      case EXACTFU:
3843       c2 = PL_fold_latin1[c1];
3844       break;
3845
3846      default:
3847       Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3848       assert(0); /* NOTREACHED */
3849     }
3850    }
3851   }
3852  }
3853
3854  /* Here have figured things out.  Set up the returns */
3855  if (use_chrtest_void) {
3856   *c2p = *c1p = CHRTEST_VOID;
3857  }
3858  else if (utf8_target) {
3859   if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3860    uvchr_to_utf8(c1_utf8, c1);
3861    uvchr_to_utf8(c2_utf8, c2);
3862   }
3863
3864   /* Invariants are stored in both the utf8 and byte outputs; Use
3865   * negative numbers otherwise for the byte ones.  Make sure that the
3866   * byte ones are the same iff the utf8 ones are the same */
3867   *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3868   *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3869     ? *c2_utf8
3870     : (c1 == c2)
3871     ? CHRTEST_NOT_A_CP_1
3872     : CHRTEST_NOT_A_CP_2;
3873  }
3874  else if (c1 > 255) {
3875  if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3876       can represent */
3877   return FALSE;
3878  }
3879
3880  *c1p = *c2p = c2;    /* c2 is the only representable value */
3881  }
3882  else {  /* c1 is representable; see about c2 */
3883  *c1p = c1;
3884  *c2p = (c2 < 256) ? c2 : c1;
3885  }
3886
3887  return TRUE;
3888 }
3889
3890 /* returns -1 on failure, $+[0] on success */
3891 STATIC SSize_t
3892 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3893 {
3894 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3895  dMY_CXT;
3896 #endif
3897  dVAR;
3898  const bool utf8_target = reginfo->is_utf8_target;
3899  const U32 uniflags = UTF8_ALLOW_DEFAULT;
3900  REGEXP *rex_sv = reginfo->prog;
3901  regexp *rex = ReANY(rex_sv);
3902  RXi_GET_DECL(rex,rexi);
3903  /* the current state. This is a cached copy of PL_regmatch_state */
3904  regmatch_state *st;
3905  /* cache heavy used fields of st in registers */
3906  regnode *scan;
3907  regnode *next;
3908  U32 n = 0; /* general value; init to avoid compiler warning */
3909  SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
3910  char *locinput = startpos;
3911  char *pushinput; /* where to continue after a PUSH */
3912  I32 nextchr;   /* is always set to UCHARAT(locinput) */
3913
3914  bool result = 0;     /* return value of S_regmatch */
3915  int depth = 0;     /* depth of backtrack stack */
3916  U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3917  const U32 max_nochange_depth =
3918   (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3919   3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3920  regmatch_state *yes_state = NULL; /* state to pop to on success of
3921                subpattern */
3922  /* mark_state piggy backs on the yes_state logic so that when we unwind
3923  the stack on success we can update the mark_state as we go */
3924  regmatch_state *mark_state = NULL; /* last mark state we have seen */
3925  regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3926  struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3927  U32 state_num;
3928  bool no_final = 0;      /* prevent failure from backtracking? */
3929  bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3930  char *startpoint = locinput;
3931  SV *popmark = NULL;     /* are we looking for a mark? */
3932  SV *sv_commit = NULL;   /* last mark name seen in failure */
3933  SV *sv_yes_mark = NULL; /* last mark name we have seen
3934        during a successful match */
3935  U32 lastopen = 0;       /* last open we saw */
3936  bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3937  SV* const oreplsv = GvSVn(PL_replgv);
3938  /* these three flags are set by various ops to signal information to
3939  * the very next op. They have a useful lifetime of exactly one loop
3940  * iteration, and are not preserved or restored by state pushes/pops
3941  */
3942  bool sw = 0;     /* the condition value in (?(cond)a|b) */
3943  bool minmod = 0;     /* the next "{n,m}" is a "{n,m}?" */
3944  int logical = 0;     /* the following EVAL is:
3945         0: (?{...})
3946         1: (?(?{...})X|Y)
3947         2: (??{...})
3948        or the following IFMATCH/UNLESSM is:
3949         false: plain (?=foo)
3950         true:  used as a condition: (?(?=foo))
3951        */
3952  PAD* last_pad = NULL;
3953  dMULTICALL;
3954  I32 gimme = G_SCALAR;
3955  CV *caller_cv = NULL; /* who called us */
3956  CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3957  CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3958  U32 maxopenparen = 0;       /* max '(' index seen so far */
3959  int to_complement;  /* Invert the result? */
3960  _char_class_number classnum;
3961  bool is_utf8_pat = reginfo->is_utf8_pat;
3962
3963 #ifdef DEBUGGING
3964  GET_RE_DEBUG_FLAGS_DECL;
3965 #endif
3966
3967  /* protect against undef(*^R) */
3968  SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
3969
3970  /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3971  multicall_oldcatch = 0;
3972  multicall_cv = NULL;
3973  cx = NULL;
3974  PERL_UNUSED_VAR(multicall_cop);
3975  PERL_UNUSED_VAR(newsp);
3976
3977
3978  PERL_ARGS_ASSERT_REGMATCH;
3979
3980  DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3981    PerlIO_printf(Perl_debug_log,"regmatch start\n");
3982  }));
3983
3984  st = PL_regmatch_state;
3985
3986  /* Note that nextchr is a byte even in UTF */
3987  SET_nextchr;
3988  scan = prog;
3989  while (scan != NULL) {
3990
3991   DEBUG_EXECUTE_r( {
3992    SV * const prop = sv_newmortal();
3993    regnode *rnext=regnext(scan);
3994    DUMP_EXEC_POS( locinput, scan, utf8_target );
3995    regprop(rex, prop, scan, reginfo);
3996
3997    PerlIO_printf(Perl_debug_log,
3998      "%3"IVdf":%*s%s(%"IVdf")\n",
3999      (IV)(scan - rexi->program), depth*2, "",
4000      SvPVX_const(prop),
4001      (PL_regkind[OP(scan)] == END || !rnext) ?
4002       0 : (IV)(rnext - rexi->program));
4003   });
4004
4005   next = scan + NEXT_OFF(scan);
4006   if (next == scan)
4007    next = NULL;
4008   state_num = OP(scan);
4009
4010   REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
4011  reenter_switch:
4012   to_complement = 0;
4013
4014   SET_nextchr;
4015   assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
4016
4017   switch (state_num) {
4018   case BOL:  /*  /^../   */
4019   case SBOL: /*  /^../s  */
4020    if (locinput == reginfo->strbeg)
4021     break;
4022    sayNO;
4023
4024   case MBOL: /*  /^../m  */
4025    if (locinput == reginfo->strbeg ||
4026     (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
4027    {
4028     break;
4029    }
4030    sayNO;
4031
4032   case GPOS: /*  \G  */
4033    if (locinput == reginfo->ganch)
4034     break;
4035    sayNO;
4036
4037   case KEEPS: /*   \K  */
4038    /* update the startpoint */
4039    st->u.keeper.val = rex->offs[0].start;
4040    rex->offs[0].start = locinput - reginfo->strbeg;
4041    PUSH_STATE_GOTO(KEEPS_next, next, locinput);
4042    /* NOTREACHED */
4043    assert(0);
4044
4045   case KEEPS_next_fail:
4046    /* rollback the start point change */
4047    rex->offs[0].start = st->u.keeper.val;
4048    sayNO_SILENT;
4049    /* NOTREACHED */
4050    assert(0);
4051
4052   case MEOL: /* /..$/m  */
4053    if (!NEXTCHR_IS_EOS && nextchr != '\n')
4054     sayNO;
4055    break;
4056
4057   case EOL: /* /..$/  */
4058    /* FALLTHROUGH */
4059   case SEOL: /* /..$/s  */
4060    if (!NEXTCHR_IS_EOS && nextchr != '\n')
4061     sayNO;
4062    if (reginfo->strend - locinput > 1)
4063     sayNO;
4064    break;
4065
4066   case EOS: /*  \z  */
4067    if (!NEXTCHR_IS_EOS)
4068     sayNO;
4069    break;
4070
4071   case SANY: /*  /./s  */
4072    if (NEXTCHR_IS_EOS)
4073     sayNO;
4074    goto increment_locinput;
4075
4076   case CANY: /*  \C  */
4077    if (NEXTCHR_IS_EOS)
4078     sayNO;
4079    locinput++;
4080    break;
4081
4082   case REG_ANY: /*  /./  */
4083    if ((NEXTCHR_IS_EOS) || nextchr == '\n')
4084     sayNO;
4085    goto increment_locinput;
4086
4087
4088 #undef  ST
4089 #define ST st->u.trie
4090   case TRIEC: /* (ab|cd) with known charclass */
4091    /* In this case the charclass data is available inline so
4092    we can fail fast without a lot of extra overhead.
4093    */
4094    if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
4095     DEBUG_EXECUTE_r(
4096      PerlIO_printf(Perl_debug_log,
4097        "%*s  %sfailed to match trie start class...%s\n",
4098        REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4099     );
4100     sayNO_SILENT;
4101     /* NOTREACHED */
4102     assert(0);
4103    }
4104    /* FALLTHROUGH */
4105   case TRIE:  /* (ab|cd)  */
4106    /* the basic plan of execution of the trie is:
4107    * At the beginning, run though all the states, and
4108    * find the longest-matching word. Also remember the position
4109    * of the shortest matching word. For example, this pattern:
4110    *    1  2 3 4    5
4111    *    ab|a|x|abcd|abc
4112    * when matched against the string "abcde", will generate
4113    * accept states for all words except 3, with the longest
4114    * matching word being 4, and the shortest being 2 (with
4115    * the position being after char 1 of the string).
4116    *
4117    * Then for each matching word, in word order (i.e. 1,2,4,5),
4118    * we run the remainder of the pattern; on each try setting
4119    * the current position to the character following the word,
4120    * returning to try the next word on failure.
4121    *
4122    * We avoid having to build a list of words at runtime by
4123    * using a compile-time structure, wordinfo[].prev, which
4124    * gives, for each word, the previous accepting word (if any).
4125    * In the case above it would contain the mappings 1->2, 2->0,
4126    * 3->0, 4->5, 5->1.  We can use this table to generate, from
4127    * the longest word (4 above), a list of all words, by
4128    * following the list of prev pointers; this gives us the
4129    * unordered list 4,5,1,2. Then given the current word we have
4130    * just tried, we can go through the list and find the
4131    * next-biggest word to try (so if we just failed on word 2,
4132    * the next in the list is 4).
4133    *
4134    * Since at runtime we don't record the matching position in
4135    * the string for each word, we have to work that out for
4136    * each word we're about to process. The wordinfo table holds
4137    * the character length of each word; given that we recorded
4138    * at the start: the position of the shortest word and its
4139    * length in chars, we just need to move the pointer the
4140    * difference between the two char lengths. Depending on
4141    * Unicode status and folding, that's cheap or expensive.
4142    *
4143    * This algorithm is optimised for the case where are only a
4144    * small number of accept states, i.e. 0,1, or maybe 2.
4145    * With lots of accepts states, and having to try all of them,
4146    * it becomes quadratic on number of accept states to find all
4147    * the next words.
4148    */
4149
4150    {
4151     /* what type of TRIE am I? (utf8 makes this contextual) */
4152     DECL_TRIE_TYPE(scan);
4153
4154     /* what trie are we using right now */
4155     reg_trie_data * const trie
4156      = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
4157     HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
4158     U32 state = trie->startstate;
4159
4160     if (   trie->bitmap
4161      && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
4162     {
4163      if (trie->states[ state ].wordnum) {
4164       DEBUG_EXECUTE_r(
4165        PerlIO_printf(Perl_debug_log,
4166           "%*s  %smatched empty string...%s\n",
4167           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4168       );
4169       if (!trie->jump)
4170        break;
4171      } else {
4172       DEBUG_EXECUTE_r(
4173        PerlIO_printf(Perl_debug_log,
4174           "%*s  %sfailed to match trie start class...%s\n",
4175           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4176       );
4177       sayNO_SILENT;
4178     }
4179     }
4180
4181    {
4182     U8 *uc = ( U8* )locinput;
4183
4184     STRLEN len = 0;
4185     STRLEN foldlen = 0;
4186     U8 *uscan = (U8*)NULL;
4187     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
4188     U32 charcount = 0; /* how many input chars we have matched */
4189     U32 accepted = 0; /* have we seen any accepting states? */
4190
4191     ST.jump = trie->jump;
4192     ST.me = scan;
4193     ST.firstpos = NULL;
4194     ST.longfold = FALSE; /* char longer if folded => it's harder */
4195     ST.nextword = 0;
4196
4197     /* fully traverse the TRIE; note the position of the
4198     shortest accept state and the wordnum of the longest
4199     accept state */
4200
4201     while ( state && uc <= (U8*)(reginfo->strend) ) {
4202      U32 base = trie->states[ state ].trans.base;
4203      UV uvc = 0;
4204      U16 charid = 0;
4205      U16 wordnum;
4206      wordnum = trie->states[ state ].wordnum;
4207
4208      if (wordnum) { /* it's an accept state */
4209       if (!accepted) {
4210        accepted = 1;
4211        /* record first match position */
4212        if (ST.longfold) {
4213         ST.firstpos = (U8*)locinput;
4214         ST.firstchars = 0;
4215        }
4216        else {
4217         ST.firstpos = uc;
4218         ST.firstchars = charcount;
4219        }
4220       }
4221       if (!ST.nextword || wordnum < ST.nextword)
4222        ST.nextword = wordnum;
4223       ST.topword = wordnum;
4224      }
4225
4226      DEBUG_TRIE_EXECUTE_r({
4227         DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
4228         PerlIO_printf( Perl_debug_log,
4229          "%*s  %sState: %4"UVxf" Accepted: %c ",
4230          2+depth * 2, "", PL_colors[4],
4231          (UV)state, (accepted ? 'Y' : 'N'));
4232      });
4233
4234      /* read a char and goto next state */
4235      if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
4236       I32 offset;
4237       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
4238            uscan, len, uvc, charid, foldlen,
4239            foldbuf, uniflags);
4240       charcount++;
4241       if (foldlen>0)
4242        ST.longfold = TRUE;
4243       if (charid &&
4244        ( ((offset =
4245        base + charid - 1 - trie->uniquecharcount)) >= 0)
4246
4247        && ((U32)offset < trie->lasttrans)
4248        && trie->trans[offset].check == state)
4249       {
4250        state = trie->trans[offset].next;
4251       }
4252       else {
4253        state = 0;
4254       }
4255       uc += len;
4256
4257      }
4258      else {
4259       state = 0;
4260      }
4261      DEBUG_TRIE_EXECUTE_r(
4262       PerlIO_printf( Perl_debug_log,
4263        "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
4264        charid, uvc, (UV)state, PL_colors[5] );
4265      );
4266     }
4267     if (!accepted)
4268     sayNO;
4269
4270     /* calculate total number of accept states */
4271     {
4272      U16 w = ST.topword;
4273      accepted = 0;
4274      while (w) {
4275       w = trie->wordinfo[w].prev;
4276       accepted++;
4277      }
4278      ST.accepted = accepted;
4279     }
4280
4281     DEBUG_EXECUTE_r(
4282      PerlIO_printf( Perl_debug_log,
4283       "%*s  %sgot %"IVdf" possible matches%s\n",
4284       REPORT_CODE_OFF + depth * 2, "",
4285       PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4286     );
4287     goto trie_first_try; /* jump into the fail handler */
4288    }}
4289    /* NOTREACHED */
4290    assert(0);
4291
4292   case TRIE_next_fail: /* we failed - try next alternative */
4293   {
4294    U8 *uc;
4295    if ( ST.jump) {
4296     REGCP_UNWIND(ST.cp);
4297     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
4298    }
4299    if (!--ST.accepted) {
4300     DEBUG_EXECUTE_r({
4301      PerlIO_printf( Perl_debug_log,
4302       "%*s  %sTRIE failed...%s\n",
4303       REPORT_CODE_OFF+depth*2, "",
4304       PL_colors[4],
4305       PL_colors[5] );
4306     });
4307     sayNO_SILENT;
4308    }
4309    {
4310     /* Find next-highest word to process.  Note that this code
4311     * is O(N^2) per trie run (O(N) per branch), so keep tight */
4312     U16 min = 0;
4313     U16 word;
4314     U16 const nextword = ST.nextword;
4315     reg_trie_wordinfo * const wordinfo
4316      = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4317     for (word=ST.topword; word; word=wordinfo[word].prev) {
4318      if (word > nextword && (!min || word < min))
4319       min = word;
4320     }
4321     ST.nextword = min;
4322    }
4323
4324   trie_first_try:
4325    if (do_cutgroup) {
4326     do_cutgroup = 0;
4327     no_final = 0;
4328    }
4329
4330    if ( ST.jump) {
4331     ST.lastparen = rex->lastparen;
4332     ST.lastcloseparen = rex->lastcloseparen;
4333     REGCP_SET(ST.cp);
4334    }
4335
4336    /* find start char of end of current word */
4337    {
4338     U32 chars; /* how many chars to skip */
4339     reg_trie_data * const trie
4340      = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4341
4342     assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4343        >=  ST.firstchars);
4344     chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4345        - ST.firstchars;
4346     uc = ST.firstpos;
4347
4348     if (ST.longfold) {
4349      /* the hard option - fold each char in turn and find
4350      * its folded length (which may be different */
4351      U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4352      STRLEN foldlen;
4353      STRLEN len;
4354      UV uvc;
4355      U8 *uscan;
4356
4357      while (chars) {
4358       if (utf8_target) {
4359        uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
4360              uniflags);
4361        uc += len;
4362       }
4363       else {
4364        uvc = *uc;
4365        uc++;
4366       }
4367       uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4368       uscan = foldbuf;
4369       while (foldlen) {
4370        if (!--chars)
4371         break;
4372        uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
4373            uniflags);
4374        uscan += len;
4375        foldlen -= len;
4376       }
4377      }
4378     }
4379     else {
4380      if (utf8_target)
4381       while (chars--)
4382        uc += UTF8SKIP(uc);
4383      else
4384       uc += chars;
4385     }
4386    }
4387
4388    scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4389        ? ST.jump[ST.nextword]
4390        : NEXT_OFF(ST.me));
4391
4392    DEBUG_EXECUTE_r({
4393     PerlIO_printf( Perl_debug_log,
4394      "%*s  %sTRIE matched word #%d, continuing%s\n",
4395      REPORT_CODE_OFF+depth*2, "",
4396      PL_colors[4],
4397      ST.nextword,
4398      PL_colors[5]
4399      );
4400    });
4401
4402    if (ST.accepted > 1 || has_cutgroup) {
4403     PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4404     /* NOTREACHED */
4405     assert(0);
4406    }
4407    /* only one choice left - just continue */
4408    DEBUG_EXECUTE_r({
4409     AV *const trie_words
4410      = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4411     SV ** const tmp = av_fetch( trie_words,
4412      ST.nextword-1, 0 );
4413     SV *sv= tmp ? sv_newmortal() : NULL;
4414
4415     PerlIO_printf( Perl_debug_log,
4416      "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4417      REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4418      ST.nextword,
4419      tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4420        PL_colors[0], PL_colors[1],
4421        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4422       )
4423      : "not compiled under -Dr",
4424      PL_colors[5] );
4425    });
4426
4427    locinput = (char*)uc;
4428    continue; /* execute rest of RE */
4429    /* NOTREACHED */
4430    assert(0);
4431   }
4432 #undef  ST
4433
4434   case EXACT: {            /*  /abc/        */
4435    char *s = STRING(scan);
4436    ln = STR_LEN(scan);
4437    if (utf8_target != is_utf8_pat) {
4438     /* The target and the pattern have differing utf8ness. */
4439     char *l = locinput;
4440     const char * const e = s + ln;
4441
4442     if (utf8_target) {
4443      /* The target is utf8, the pattern is not utf8.
4444      * Above-Latin1 code points can't match the pattern;
4445      * invariants match exactly, and the other Latin1 ones need
4446      * to be downgraded to a single byte in order to do the
4447      * comparison.  (If we could be confident that the target
4448      * is not malformed, this could be refactored to have fewer
4449      * tests by just assuming that if the first bytes match, it
4450      * is an invariant, but there are tests in the test suite
4451      * dealing with (??{...}) which violate this) */
4452      while (s < e) {
4453       if (l >= reginfo->strend
4454        || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4455       {
4456        sayNO;
4457       }
4458       if (UTF8_IS_INVARIANT(*(U8*)l)) {
4459        if (*l != *s) {
4460         sayNO;
4461        }
4462        l++;
4463       }
4464       else {
4465        if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4466        {
4467         sayNO;
4468        }
4469        l += 2;
4470       }
4471       s++;
4472      }
4473     }
4474     else {
4475      /* The target is not utf8, the pattern is utf8. */
4476      while (s < e) {
4477       if (l >= reginfo->strend
4478        || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4479       {
4480        sayNO;
4481       }
4482       if (UTF8_IS_INVARIANT(*(U8*)s)) {
4483        if (*s != *l) {
4484         sayNO;
4485        }
4486        s++;
4487       }
4488       else {
4489        if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4490        {
4491         sayNO;
4492        }
4493        s += 2;
4494       }
4495       l++;
4496      }
4497     }
4498     locinput = l;
4499    }
4500    else {
4501     /* The target and the pattern have the same utf8ness. */
4502     /* Inline the first character, for speed. */
4503     if (reginfo->strend - locinput < ln
4504      || UCHARAT(s) != nextchr
4505      || (ln > 1 && memNE(s, locinput, ln)))
4506     {
4507      sayNO;
4508     }
4509     locinput += ln;
4510    }
4511    break;
4512    }
4513
4514   case EXACTFL: {          /*  /abc/il      */
4515    re_fold_t folder;
4516    const U8 * fold_array;
4517    const char * s;
4518    U32 fold_utf8_flags;
4519
4520    folder = foldEQ_locale;
4521    fold_array = PL_fold_locale;
4522    fold_utf8_flags = FOLDEQ_LOCALE;
4523    goto do_exactf;
4524
4525   case EXACTFU_SS:         /*  /\x{df}/iu   */
4526   case EXACTFU:            /*  /abc/iu      */
4527    folder = foldEQ_latin1;
4528    fold_array = PL_fold_latin1;
4529    fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4530    goto do_exactf;
4531
4532   case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
4533         patterns */
4534    assert(! is_utf8_pat);
4535    /* FALLTHROUGH */
4536   case EXACTFA:            /*  /abc/iaa     */
4537    folder = foldEQ_latin1;
4538    fold_array = PL_fold_latin1;
4539    fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4540    goto do_exactf;
4541
4542   case EXACTF:             /*  /abc/i    This node only generated for
4543            non-utf8 patterns */
4544    assert(! is_utf8_pat);
4545    folder = foldEQ;
4546    fold_array = PL_fold;
4547    fold_utf8_flags = 0;
4548
4549   do_exactf:
4550    s = STRING(scan);
4551    ln = STR_LEN(scan);
4552
4553    if (utf8_target
4554     || is_utf8_pat
4555     || state_num == EXACTFU_SS
4556     || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4557    {
4558    /* Either target or the pattern are utf8, or has the issue where
4559    * the fold lengths may differ. */
4560     const char * const l = locinput;
4561     char *e = reginfo->strend;
4562
4563     if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4564           l, &e, 0,  utf8_target, fold_utf8_flags))
4565     {
4566      sayNO;
4567     }
4568     locinput = e;
4569     break;
4570    }
4571
4572    /* Neither the target nor the pattern are utf8 */
4573    if (UCHARAT(s) != nextchr
4574     && !NEXTCHR_IS_EOS
4575     && UCHARAT(s) != fold_array[nextchr])
4576    {
4577     sayNO;
4578    }
4579    if (reginfo->strend - locinput < ln)
4580     sayNO;
4581    if (ln > 1 && ! folder(s, locinput, ln))
4582     sayNO;
4583    locinput += ln;
4584    break;
4585   }
4586
4587   /* XXX Could improve efficiency by separating these all out using a
4588   * macro or in-line function.  At that point regcomp.c would no longer
4589   * have to set the FLAGS fields of these */
4590   case BOUNDL:  /*  /\b/l  */
4591   case NBOUNDL: /*  /\B/l  */
4592   case BOUND:   /*  /\b/   */
4593   case BOUNDU:  /*  /\b/u  */
4594   case BOUNDA:  /*  /\b/a  */
4595   case NBOUND:  /*  /\B/   */
4596   case NBOUNDU: /*  /\B/u  */
4597   case NBOUNDA: /*  /\B/a  */
4598    /* was last char in word? */
4599    if (utf8_target
4600     && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4601     && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4602    {
4603     if (locinput == reginfo->strbeg)
4604      ln = '\n';
4605     else {
4606      const U8 * const r =
4607        reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4608
4609      ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
4610                 0, uniflags);
4611     }
4612     if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4613      ln = isWORDCHAR_uni(ln);
4614      if (NEXTCHR_IS_EOS)
4615       n = 0;
4616      else {
4617       LOAD_UTF8_CHARCLASS_ALNUM();
4618       n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4619                 utf8_target);
4620      }
4621     }
4622     else {
4623      ln = isWORDCHAR_LC_uvchr(ln);
4624      n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4625     }
4626    }
4627    else {
4628
4629     /* Here the string isn't utf8, or is utf8 and only ascii
4630     * characters are to match \w.  In the latter case looking at
4631     * the byte just prior to the current one may be just the final
4632     * byte of a multi-byte character.  This is ok.  There are two
4633     * cases:
4634     * 1) it is a single byte character, and then the test is doing
4635     * just what it's supposed to.
4636     * 2) it is a multi-byte character, in which case the final
4637     * byte is never mistakable for ASCII, and so the test
4638     * will say it is not a word character, which is the
4639     * correct answer. */
4640     ln = (locinput != reginfo->strbeg) ?
4641      UCHARAT(locinput - 1) : '\n';
4642     switch (FLAGS(scan)) {
4643      case REGEX_UNICODE_CHARSET:
4644       ln = isWORDCHAR_L1(ln);
4645       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4646       break;
4647      case REGEX_LOCALE_CHARSET:
4648       ln = isWORDCHAR_LC(ln);
4649       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4650       break;
4651      case REGEX_DEPENDS_CHARSET:
4652       ln = isWORDCHAR(ln);
4653       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4654       break;
4655      case REGEX_ASCII_RESTRICTED_CHARSET:
4656      case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4657       ln = isWORDCHAR_A(ln);
4658       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4659       break;
4660      default:
4661       Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4662     }
4663    }
4664    /* Note requires that all BOUNDs be lower than all NBOUNDs in
4665    * regcomp.sym */
4666    if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4667      sayNO;
4668    break;
4669
4670   case ANYOF:  /*  /[abc]/       */
4671    if (NEXTCHR_IS_EOS)
4672     sayNO;
4673    if (utf8_target) {
4674     if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
4675                 utf8_target))
4676      sayNO;
4677     locinput += UTF8SKIP(locinput);
4678    }
4679    else {
4680     if (!REGINCLASS(rex, scan, (U8*)locinput))
4681      sayNO;
4682     locinput++;
4683    }
4684    break;
4685
4686   /* The argument (FLAGS) to all the POSIX node types is the class number
4687   * */
4688
4689   case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4690    to_complement = 1;
4691    /* FALLTHROUGH */
4692
4693   case POSIXL:    /* \w or [:punct:] etc. under /l */
4694    if (NEXTCHR_IS_EOS)
4695     sayNO;
4696
4697    /* Use isFOO_lc() for characters within Latin1.  (Note that
4698    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4699    * wouldn't be invariant) */
4700    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4701     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4702      sayNO;
4703     }
4704    }
4705    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4706     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4707           (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4708                *(locinput + 1))))))
4709     {
4710      sayNO;
4711     }
4712    }
4713    else { /* Here, must be an above Latin-1 code point */
4714     goto utf8_posix_not_eos;
4715    }
4716
4717    /* Here, must be utf8 */
4718    locinput += UTF8SKIP(locinput);
4719    break;
4720
4721   case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4722    to_complement = 1;
4723    /* FALLTHROUGH */
4724
4725   case POSIXD:    /* \w or [:punct:] etc. under /d */
4726    if (utf8_target) {
4727     goto utf8_posix;
4728    }
4729    goto posixa;
4730
4731   case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4732
4733    if (NEXTCHR_IS_EOS) {
4734     sayNO;
4735    }
4736
4737    /* All UTF-8 variants match */
4738    if (! UTF8_IS_INVARIANT(nextchr)) {
4739     goto increment_locinput;
4740    }
4741
4742    to_complement = 1;
4743    /* FALLTHROUGH */
4744
4745   case POSIXA:    /* \w or [:punct:] etc. under /a */
4746
4747   posixa:
4748    /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4749    * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4750    * character is a single byte */
4751
4752    if (NEXTCHR_IS_EOS
4753     || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4754                FLAGS(scan)))))
4755    {
4756     sayNO;
4757    }
4758
4759    /* Here we are either not in utf8, or we matched a utf8-invariant,
4760    * so the next char is the next byte */
4761    locinput++;
4762    break;
4763
4764   case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4765    to_complement = 1;
4766    /* FALLTHROUGH */
4767
4768   case POSIXU:    /* \w or [:punct:] etc. under /u */
4769   utf8_posix:
4770    if (NEXTCHR_IS_EOS) {
4771     sayNO;
4772    }
4773   utf8_posix_not_eos:
4774
4775    /* Use _generic_isCC() for characters within Latin1.  (Note that
4776    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4777    * wouldn't be invariant) */
4778    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4779     if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4780               FLAGS(scan)))))
4781     {
4782      sayNO;
4783     }
4784     locinput++;
4785    }
4786    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4787     if (! (to_complement
4788      ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4789                *(locinput + 1)),
4790            FLAGS(scan)))))
4791     {
4792      sayNO;
4793     }
4794     locinput += 2;
4795    }
4796    else {  /* Handle above Latin-1 code points */
4797     classnum = (_char_class_number) FLAGS(scan);
4798     if (classnum < _FIRST_NON_SWASH_CC) {
4799
4800      /* Here, uses a swash to find such code points.  Load if if
4801      * not done already */
4802      if (! PL_utf8_swash_ptrs[classnum]) {
4803       U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4804       PL_utf8_swash_ptrs[classnum]
4805         = _core_swash_init("utf8",
4806           "",
4807           &PL_sv_undef, 1, 0,
4808           PL_XPosix_ptrs[classnum], &flags);
4809      }
4810      if (! (to_complement
4811       ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4812            (U8 *) locinput, TRUE))))
4813      {
4814       sayNO;
4815      }
4816     }
4817     else {  /* Here, uses macros to find above Latin-1 code points */
4818      switch (classnum) {
4819       case _CC_ENUM_SPACE:    /* XXX would require separate
4820             code if we revert the change
4821             of \v matching this */
4822       case _CC_ENUM_PSXSPC:
4823        if (! (to_complement
4824           ^ cBOOL(is_XPERLSPACE_high(locinput))))
4825        {
4826         sayNO;
4827        }
4828        break;
4829       case _CC_ENUM_BLANK:
4830        if (! (to_complement
4831            ^ cBOOL(is_HORIZWS_high(locinput))))
4832        {
4833         sayNO;
4834        }
4835        break;
4836       case _CC_ENUM_XDIGIT:
4837        if (! (to_complement
4838            ^ cBOOL(is_XDIGIT_high(locinput))))
4839        {
4840         sayNO;
4841        }
4842        break;
4843       case _CC_ENUM_VERTSPACE:
4844        if (! (to_complement
4845            ^ cBOOL(is_VERTWS_high(locinput))))
4846        {
4847         sayNO;
4848        }
4849        break;
4850       default:    /* The rest, e.g. [:cntrl:], can't match
4851          above Latin1 */
4852        if (! to_complement) {
4853         sayNO;
4854        }
4855        break;
4856      }
4857     }
4858     locinput += UTF8SKIP(locinput);
4859    }
4860    break;
4861
4862   case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4863      a Unicode extended Grapheme Cluster */
4864    /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4865    extended Grapheme Cluster is:
4866
4867    CR LF
4868    | Prepend* Begin Extend*
4869    | .
4870
4871    Begin is:           ( Special_Begin | ! Control )
4872    Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4873    Extend is:          ( Grapheme_Extend | Spacing_Mark )
4874    Control is:         [ GCB_Control | CR | LF ]
4875    Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4876
4877    If we create a 'Regular_Begin' = Begin - Special_Begin, then
4878    we can rewrite
4879
4880     Begin is ( Regular_Begin + Special Begin )
4881
4882    It turns out that 98.4% of all Unicode code points match
4883    Regular_Begin.  Doing it this way eliminates a table match in
4884    the previous implementation for almost all Unicode code points.
4885
4886    There is a subtlety with Prepend* which showed up in testing.
4887    Note that the Begin, and only the Begin is required in:
4888     | Prepend* Begin Extend*
4889    Also, Begin contains '! Control'.  A Prepend must be a
4890    '!  Control', which means it must also be a Begin.  What it
4891    comes down to is that if we match Prepend* and then find no
4892    suitable Begin afterwards, that if we backtrack the last
4893    Prepend, that one will be a suitable Begin.
4894    */
4895
4896    if (NEXTCHR_IS_EOS)
4897     sayNO;
4898    if  (! utf8_target) {
4899
4900     /* Match either CR LF  or '.', as all the other possibilities
4901     * require utf8 */
4902     locinput++;     /* Match the . or CR */
4903     if (nextchr == '\r' /* And if it was CR, and the next is LF,
4904          match the LF */
4905      && locinput < reginfo->strend
4906      && UCHARAT(locinput) == '\n')
4907     {
4908      locinput++;
4909     }
4910    }
4911    else {
4912
4913     /* Utf8: See if is ( CR LF ); already know that locinput <
4914     * reginfo->strend, so locinput+1 is in bounds */
4915     if ( nextchr == '\r' && locinput+1 < reginfo->strend
4916      && UCHARAT(locinput + 1) == '\n')
4917     {
4918      locinput += 2;
4919     }
4920     else {
4921      STRLEN len;
4922
4923      /* In case have to backtrack to beginning, then match '.' */
4924      char *starting = locinput;
4925
4926      /* In case have to backtrack the last prepend */
4927      char *previous_prepend = NULL;
4928
4929      LOAD_UTF8_CHARCLASS_GCB();
4930
4931      /* Match (prepend)*   */
4932      while (locinput < reginfo->strend
4933       && (len = is_GCB_Prepend_utf8(locinput)))
4934      {
4935       previous_prepend = locinput;
4936       locinput += len;
4937      }
4938
4939      /* As noted above, if we matched a prepend character, but
4940      * the next thing won't match, back off the last prepend we
4941      * matched, as it is guaranteed to match the begin */
4942      if (previous_prepend
4943       && (locinput >=  reginfo->strend
4944        || (! swash_fetch(PL_utf8_X_regular_begin,
4945            (U8*)locinput, utf8_target)
4946         && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4947       )
4948      {
4949       locinput = previous_prepend;
4950      }
4951
4952      /* Note that here we know reginfo->strend > locinput, as we
4953      * tested that upon input to this switch case, and if we
4954      * moved locinput forward, we tested the result just above
4955      * and it either passed, or we backed off so that it will
4956      * now pass */
4957      if (swash_fetch(PL_utf8_X_regular_begin,
4958          (U8*)locinput, utf8_target)) {
4959       locinput += UTF8SKIP(locinput);
4960      }
4961      else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4962
4963       /* Here did not match the required 'Begin' in the
4964       * second term.  So just match the very first
4965       * character, the '.' of the final term of the regex */
4966       locinput = starting + UTF8SKIP(starting);
4967       goto exit_utf8;
4968      } else {
4969
4970       /* Here is a special begin.  It can be composed of
4971       * several individual characters.  One possibility is
4972       * RI+ */
4973       if ((len = is_GCB_RI_utf8(locinput))) {
4974        locinput += len;
4975        while (locinput < reginfo->strend
4976         && (len = is_GCB_RI_utf8(locinput)))
4977        {
4978         locinput += len;
4979        }
4980       } else if ((len = is_GCB_T_utf8(locinput))) {
4981        /* Another possibility is T+ */
4982        locinput += len;
4983        while (locinput < reginfo->strend
4984         && (len = is_GCB_T_utf8(locinput)))
4985        {
4986         locinput += len;
4987        }
4988       } else {
4989
4990        /* Here, neither RI+ nor T+; must be some other
4991        * Hangul.  That means it is one of the others: L,
4992        * LV, LVT or V, and matches:
4993        * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4994
4995        /* Match L*           */
4996        while (locinput < reginfo->strend
4997         && (len = is_GCB_L_utf8(locinput)))
4998        {
4999         locinput += len;
5000        }
5001
5002        /* Here, have exhausted L*.  If the next character
5003        * is not an LV, LVT nor V, it means we had to have
5004        * at least one L, so matches L+ in the original
5005        * equation, we have a complete hangul syllable.
5006        * Are done. */
5007
5008        if (locinput < reginfo->strend
5009         && is_GCB_LV_LVT_V_utf8(locinput))
5010        {
5011         /* Otherwise keep going.  Must be LV, LVT or V.
5012         * See if LVT, by first ruling out V, then LV */
5013         if (! is_GCB_V_utf8(locinput)
5014           /* All but every TCount one is LV */
5015          && (valid_utf8_to_uvchr((U8 *) locinput,
5016                   NULL)
5017                   - SBASE)
5018           % TCount != 0)
5019         {
5020          locinput += UTF8SKIP(locinput);
5021         } else {
5022
5023          /* Must be  V or LV.  Take it, then match
5024          * V*     */
5025          locinput += UTF8SKIP(locinput);
5026          while (locinput < reginfo->strend
5027           && (len = is_GCB_V_utf8(locinput)))
5028          {
5029           locinput += len;
5030          }
5031         }
5032
5033         /* And any of LV, LVT, or V can be followed
5034         * by T*            */
5035         while (locinput < reginfo->strend
5036          && (len = is_GCB_T_utf8(locinput)))
5037         {
5038          locinput += len;
5039         }
5040        }
5041       }
5042      }
5043
5044      /* Match any extender */
5045      while (locinput < reginfo->strend
5046        && swash_fetch(PL_utf8_X_extend,
5047            (U8*)locinput, utf8_target))
5048      {
5049       locinput += UTF8SKIP(locinput);
5050      }
5051     }
5052    exit_utf8:
5053     if (locinput > reginfo->strend) sayNO;
5054    }
5055    break;
5056
5057   case NREFFL:  /*  /\g{name}/il  */
5058   {   /* The capture buffer cases.  The ones beginning with N for the
5059    named buffers just convert to the equivalent numbered and
5060    pretend they were called as the corresponding numbered buffer
5061    op.  */
5062    /* don't initialize these in the declaration, it makes C++
5063    unhappy */
5064    const char *s;
5065    char type;
5066    re_fold_t folder;
5067    const U8 *fold_array;
5068    UV utf8_fold_flags;
5069
5070    folder = foldEQ_locale;
5071    fold_array = PL_fold_locale;
5072    type = REFFL;
5073    utf8_fold_flags = FOLDEQ_LOCALE;
5074    goto do_nref;
5075
5076   case NREFFA:  /*  /\g{name}/iaa  */
5077    folder = foldEQ_latin1;
5078    fold_array = PL_fold_latin1;
5079    type = REFFA;
5080    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5081    goto do_nref;
5082
5083   case NREFFU:  /*  /\g{name}/iu  */
5084    folder = foldEQ_latin1;
5085    fold_array = PL_fold_latin1;
5086    type = REFFU;
5087    utf8_fold_flags = 0;
5088    goto do_nref;
5089
5090   case NREFF:  /*  /\g{name}/i  */
5091    folder = foldEQ;
5092    fold_array = PL_fold;
5093    type = REFF;
5094    utf8_fold_flags = 0;
5095    goto do_nref;
5096
5097   case NREF:  /*  /\g{name}/   */
5098    type = REF;
5099    folder = NULL;
5100    fold_array = NULL;
5101    utf8_fold_flags = 0;
5102   do_nref:
5103
5104    /* For the named back references, find the corresponding buffer
5105    * number */
5106    n = reg_check_named_buff_matched(rex,scan);
5107
5108    if ( ! n ) {
5109     sayNO;
5110    }
5111    goto do_nref_ref_common;
5112
5113   case REFFL:  /*  /\1/il  */
5114    folder = foldEQ_locale;
5115    fold_array = PL_fold_locale;
5116    utf8_fold_flags = FOLDEQ_LOCALE;
5117    goto do_ref;
5118
5119   case REFFA:  /*  /\1/iaa  */
5120    folder = foldEQ_latin1;
5121    fold_array = PL_fold_latin1;
5122    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5123    goto do_ref;
5124
5125   case REFFU:  /*  /\1/iu  */
5126    folder = foldEQ_latin1;
5127    fold_array = PL_fold_latin1;
5128    utf8_fold_flags = 0;
5129    goto do_ref;
5130
5131   case REFF:  /*  /\1/i  */
5132    folder = foldEQ;
5133    fold_array = PL_fold;
5134    utf8_fold_flags = 0;
5135    goto do_ref;
5136
5137   case REF:  /*  /\1/    */
5138    folder = NULL;
5139    fold_array = NULL;
5140    utf8_fold_flags = 0;
5141
5142   do_ref:
5143    type = OP(scan);
5144    n = ARG(scan);  /* which paren pair */
5145
5146   do_nref_ref_common:
5147    ln = rex->offs[n].start;
5148    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5149    if (rex->lastparen < n || ln == -1)
5150     sayNO;   /* Do not match unless seen CLOSEn. */
5151    if (ln == rex->offs[n].end)
5152     break;
5153
5154    s = reginfo->strbeg + ln;
5155    if (type != REF /* REF can do byte comparison */
5156     && (utf8_target || type == REFFU || type == REFFL))
5157    {
5158     char * limit = reginfo->strend;
5159
5160     /* This call case insensitively compares the entire buffer
5161      * at s, with the current input starting at locinput, but
5162      * not going off the end given by reginfo->strend, and
5163      * returns in <limit> upon success, how much of the
5164      * current input was matched */
5165     if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
5166          locinput, &limit, 0, utf8_target, utf8_fold_flags))
5167     {
5168      sayNO;
5169     }
5170     locinput = limit;
5171     break;
5172    }
5173
5174    /* Not utf8:  Inline the first character, for speed. */
5175    if (!NEXTCHR_IS_EOS &&
5176     UCHARAT(s) != nextchr &&
5177     (type == REF ||
5178     UCHARAT(s) != fold_array[nextchr]))
5179     sayNO;
5180    ln = rex->offs[n].end - ln;
5181    if (locinput + ln > reginfo->strend)
5182     sayNO;
5183    if (ln > 1 && (type == REF
5184       ? memNE(s, locinput, ln)
5185       : ! folder(s, locinput, ln)))
5186     sayNO;
5187    locinput += ln;
5188    break;
5189   }
5190
5191   case NOTHING: /* null op; e.g. the 'nothing' following
5192      * the '*' in m{(a+|b)*}' */
5193    break;
5194   case TAIL: /* placeholder while compiling (A|B|C) */
5195    break;
5196
5197   case BACK: /* ??? doesn't appear to be used ??? */
5198    break;
5199
5200 #undef  ST
5201 #define ST st->u.eval
5202   {
5203    SV *ret;
5204    REGEXP *re_sv;
5205    regexp *re;
5206    regexp_internal *rei;
5207    regnode *startpoint;
5208
5209   case GOSTART: /*  (?R)  */
5210   case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
5211    if (cur_eval && cur_eval->locinput==locinput) {
5212     if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
5213      Perl_croak(aTHX_ "Infinite recursion in regex");
5214     if ( ++nochange_depth > max_nochange_depth )
5215      Perl_croak(aTHX_
5216       "Pattern subroutine nesting without pos change"
5217       " exceeded limit in regex");
5218    } else {
5219     nochange_depth = 0;
5220    }
5221    re_sv = rex_sv;
5222    re = rex;
5223    rei = rexi;
5224    if (OP(scan)==GOSUB) {
5225     startpoint = scan + ARG2L(scan);
5226     ST.close_paren = ARG(scan);
5227    } else {
5228     startpoint = rei->program+1;
5229     ST.close_paren = 0;
5230    }
5231
5232    /* Save all the positions seen so far. */
5233    ST.cp = regcppush(rex, 0, maxopenparen);
5234    REGCP_SET(ST.lastcp);
5235
5236    /* and then jump to the code we share with EVAL */
5237    goto eval_recurse_doit;
5238
5239    /* NOTREACHED */
5240    assert(0);
5241
5242   case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
5243    if (cur_eval && cur_eval->locinput==locinput) {
5244     if ( ++nochange_depth > max_nochange_depth )
5245      Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
5246    } else {
5247     nochange_depth = 0;
5248    }
5249    {
5250     /* execute the code in the {...} */
5251
5252     dSP;
5253     IV before;
5254     OP * const oop = PL_op;
5255     COP * const ocurcop = PL_curcop;
5256     OP *nop;
5257     CV *newcv;
5258
5259     /* save *all* paren positions */
5260     regcppush(rex, 0, maxopenparen);
5261     REGCP_SET(runops_cp);
5262
5263     if (!caller_cv)
5264      caller_cv = find_runcv(NULL);
5265
5266     n = ARG(scan);
5267
5268     if (rexi->data->what[n] == 'r') { /* code from an external qr */
5269      newcv = (ReANY(
5270             (REGEXP*)(rexi->data->data[n])
5271            ))->qr_anoncv
5272           ;
5273      nop = (OP*)rexi->data->data[n+1];
5274     }
5275     else if (rexi->data->what[n] == 'l') { /* literal code */
5276      newcv = caller_cv;
5277      nop = (OP*)rexi->data->data[n];
5278      assert(CvDEPTH(newcv));
5279     }
5280     else {
5281      /* literal with own CV */
5282      assert(rexi->data->what[n] == 'L');
5283      newcv = rex->qr_anoncv;
5284      nop = (OP*)rexi->data->data[n];
5285     }
5286
5287     /* normally if we're about to execute code from the same
5288     * CV that we used previously, we just use the existing
5289     * CX stack entry. However, its possible that in the
5290     * meantime we may have backtracked, popped from the save
5291     * stack, and undone the SAVECOMPPAD(s) associated with
5292     * PUSH_MULTICALL; in which case PL_comppad no longer
5293     * points to newcv's pad. */
5294     if (newcv != last_pushed_cv || PL_comppad != last_pad)
5295     {
5296      U8 flags = (CXp_SUB_RE |
5297         ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
5298      if (last_pushed_cv) {
5299       CHANGE_MULTICALL_FLAGS(newcv, flags);
5300      }
5301      else {
5302       PUSH_MULTICALL_FLAGS(newcv, flags);
5303      }
5304      last_pushed_cv = newcv;
5305     }
5306     else {
5307      /* these assignments are just to silence compiler
5308      * warnings */
5309      multicall_cop = NULL;
5310      newsp = NULL;
5311     }
5312     last_pad = PL_comppad;
5313
5314     /* the initial nextstate you would normally execute
5315     * at the start of an eval (which would cause error
5316     * messages to come from the eval), may be optimised
5317     * away from the execution path in the regex code blocks;
5318     * so manually set PL_curcop to it initially */
5319     {
5320      OP *o = cUNOPx(nop)->op_first;
5321      assert(o->op_type == OP_NULL);
5322      if (o->op_targ == OP_SCOPE) {
5323       o = cUNOPo->op_first;
5324      }
5325      else {
5326       assert(o->op_targ == OP_LEAVE);
5327       o = cUNOPo->op_first;
5328       assert(o->op_type == OP_ENTER);
5329       o = OP_SIBLING(o);
5330      }
5331
5332      if (o->op_type != OP_STUB) {
5333       assert(    o->op_type == OP_NEXTSTATE
5334         || o->op_type == OP_DBSTATE
5335         || (o->op_type == OP_NULL
5336          &&  (  o->op_targ == OP_NEXTSTATE
5337           || o->op_targ == OP_DBSTATE
5338           )
5339          )
5340       );
5341       PL_curcop = (COP*)o;
5342      }
5343     }
5344     nop = nop->op_next;
5345
5346     DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
5347      "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5348
5349     rex->offs[0].end = locinput - reginfo->strbeg;
5350     if (reginfo->info_aux_eval->pos_magic)
5351      MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5352         reginfo->sv, reginfo->strbeg,
5353         locinput - reginfo->strbeg);
5354
5355     if (sv_yes_mark) {
5356      SV *sv_mrk = get_sv("REGMARK", 1);
5357      sv_setsv(sv_mrk, sv_yes_mark);
5358     }
5359
5360     /* we don't use MULTICALL here as we want to call the
5361     * first op of the block of interest, rather than the
5362     * first op of the sub */
5363     before = (IV)(SP-PL_stack_base);
5364     PL_op = nop;
5365     CALLRUNOPS(aTHX);   /* Scalar context. */
5366     SPAGAIN;
5367     if ((IV)(SP-PL_stack_base) == before)
5368      ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
5369     else {
5370      ret = POPs;
5371      PUTBACK;
5372     }
5373
5374     /* before restoring everything, evaluate the returned
5375     * value, so that 'uninit' warnings don't use the wrong
5376     * PL_op or pad. Also need to process any magic vars
5377     * (e.g. $1) *before* parentheses are restored */
5378
5379     PL_op = NULL;
5380
5381     re_sv = NULL;
5382     if (logical == 0)        /*   (?{})/   */
5383      sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5384     else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
5385      sw = cBOOL(SvTRUE(ret));
5386      logical = 0;
5387     }
5388     else {                   /*  /(??{})  */
5389      /*  if its overloaded, let the regex compiler handle
5390      *  it; otherwise extract regex, or stringify  */
5391      if (SvGMAGICAL(ret))
5392       ret = sv_mortalcopy(ret);
5393      if (!SvAMAGIC(ret)) {
5394       SV *sv = ret;
5395       if (SvROK(sv))
5396        sv = SvRV(sv);
5397       if (SvTYPE(sv) == SVt_REGEXP)
5398        re_sv = (REGEXP*) sv;
5399       else if (SvSMAGICAL(ret)) {
5400        MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
5401        if (mg)
5402         re_sv = (REGEXP *) mg->mg_obj;
5403       }
5404
5405       /* force any undef warnings here */
5406       if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
5407        ret = sv_mortalcopy(ret);
5408        (void) SvPV_force_nolen(ret);
5409       }
5410      }
5411
5412     }
5413
5414     /* *** Note that at this point we don't restore
5415     * PL_comppad, (or pop the CxSUB) on the assumption it may
5416     * be used again soon. This is safe as long as nothing
5417     * in the regexp code uses the pad ! */
5418     PL_op = oop;
5419     PL_curcop = ocurcop;
5420     S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5421     PL_curpm = PL_reg_curpm;
5422
5423     if (logical != 2)
5424      break;
5425    }
5426
5427     /* only /(??{})/  from now on */
5428     logical = 0;
5429     {
5430      /* extract RE object from returned value; compiling if
5431      * necessary */
5432
5433      if (re_sv) {
5434       re_sv = reg_temp_copy(NULL, re_sv);
5435      }
5436      else {
5437       U32 pm_flags = 0;
5438
5439       if (SvUTF8(ret) && IN_BYTES) {
5440        /* In use 'bytes': make a copy of the octet
5441        * sequence, but without the flag on */
5442        STRLEN len;
5443        const char *const p = SvPV(ret, len);
5444        ret = newSVpvn_flags(p, len, SVs_TEMP);
5445       }
5446       if (rex->intflags & PREGf_USE_RE_EVAL)
5447        pm_flags |= PMf_USE_RE_EVAL;
5448
5449       /* if we got here, it should be an engine which
5450       * supports compiling code blocks and stuff */
5451       assert(rex->engine && rex->engine->op_comp);
5452       assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5453       re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5454          rex->engine, NULL, NULL,
5455          /* copy /msix etc to inner pattern */
5456          scan->flags,
5457          pm_flags);
5458
5459       if (!(SvFLAGS(ret)
5460        & (SVs_TEMP | SVs_GMG | SVf_ROK))
5461       && (!SvPADTMP(ret) || SvREADONLY(ret))) {
5462        /* This isn't a first class regexp. Instead, it's
5463        caching a regexp onto an existing, Perl visible
5464        scalar.  */
5465        sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5466       }
5467      }
5468      SAVEFREESV(re_sv);
5469      re = ReANY(re_sv);
5470     }
5471     RXp_MATCH_COPIED_off(re);
5472     re->subbeg = rex->subbeg;
5473     re->sublen = rex->sublen;
5474     re->suboffset = rex->suboffset;
5475     re->subcoffset = rex->subcoffset;
5476     re->lastparen = 0;
5477     re->lastcloseparen = 0;
5478     rei = RXi_GET(re);
5479     DEBUG_EXECUTE_r(
5480      debug_start_match(re_sv, utf8_target, locinput,
5481          reginfo->strend, "Matching embedded");
5482     );
5483     startpoint = rei->program + 1;
5484      ST.close_paren = 0; /* only used for GOSUB */
5485     /* Save all the seen positions so far. */
5486     ST.cp = regcppush(rex, 0, maxopenparen);
5487     REGCP_SET(ST.lastcp);
5488     /* and set maxopenparen to 0, since we are starting a "fresh" match */
5489     maxopenparen = 0;
5490     /* run the pattern returned from (??{...}) */
5491
5492   eval_recurse_doit: /* Share code with GOSUB below this line
5493        * At this point we expect the stack context to be
5494        * set up correctly */
5495
5496     /* invalidate the S-L poscache. We're now executing a
5497     * different set of WHILEM ops (and their associated
5498     * indexes) against the same string, so the bits in the
5499     * cache are meaningless. Setting maxiter to zero forces
5500     * the cache to be invalidated and zeroed before reuse.
5501     * XXX This is too dramatic a measure. Ideally we should
5502     * save the old cache and restore when running the outer
5503     * pattern again */
5504     reginfo->poscache_maxiter = 0;
5505
5506     /* the new regexp might have a different is_utf8_pat than we do */
5507     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5508
5509     ST.prev_rex = rex_sv;
5510     ST.prev_curlyx = cur_curlyx;
5511     rex_sv = re_sv;
5512     SET_reg_curpm(rex_sv);
5513     rex = re;
5514     rexi = rei;
5515     cur_curlyx = NULL;
5516     ST.B = next;
5517     ST.prev_eval = cur_eval;
5518     cur_eval = st;
5519     /* now continue from first node in postoned RE */
5520     PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5521     /* NOTREACHED */
5522     assert(0);
5523   }
5524
5525   case EVAL_AB: /* cleanup after a successful (??{A})B */
5526    /* note: this is called twice; first after popping B, then A */
5527    rex_sv = ST.prev_rex;
5528    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5529    SET_reg_curpm(rex_sv);
5530    rex = ReANY(rex_sv);
5531    rexi = RXi_GET(rex);
5532    {
5533     /* preserve $^R across LEAVE's. See Bug 121070. */
5534     SV *save_sv= GvSV(PL_replgv);
5535     SvREFCNT_inc(save_sv);
5536     regcpblow(ST.cp); /* LEAVE in disguise */
5537     sv_setsv(GvSV(PL_replgv), save_sv);
5538     SvREFCNT_dec(save_sv);
5539    }
5540    cur_eval = ST.prev_eval;
5541    cur_curlyx = ST.prev_curlyx;
5542
5543    /* Invalidate cache. See "invalidate" comment above. */
5544    reginfo->poscache_maxiter = 0;
5545    if ( nochange_depth )
5546     nochange_depth--;
5547    sayYES;
5548
5549
5550   case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5551    /* note: this is called twice; first after popping B, then A */
5552    rex_sv = ST.prev_rex;
5553    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5554    SET_reg_curpm(rex_sv);
5555    rex = ReANY(rex_sv);
5556    rexi = RXi_GET(rex);
5557
5558    REGCP_UNWIND(ST.lastcp);
5559    regcppop(rex, &maxopenparen);
5560    cur_eval = ST.prev_eval;
5561    cur_curlyx = ST.prev_curlyx;
5562    /* Invalidate cache. See "invalidate" comment above. */
5563    reginfo->poscache_maxiter = 0;
5564    if ( nochange_depth )
5565     nochange_depth--;
5566    sayNO_SILENT;
5567 #undef ST
5568
5569   case OPEN: /*  (  */
5570    n = ARG(scan);  /* which paren pair */
5571    rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5572    if (n > maxopenparen)
5573     maxopenparen = n;
5574    DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5575     "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5576     PTR2UV(rex),
5577     PTR2UV(rex->offs),
5578     (UV)n,
5579     (IV)rex->offs[n].start_tmp,
5580     (UV)maxopenparen
5581    ));
5582    lastopen = n;
5583    break;
5584
5585 /* XXX really need to log other places start/end are set too */
5586 #define CLOSE_CAPTURE \
5587  rex->offs[n].start = rex->offs[n].start_tmp; \
5588  rex->offs[n].end = locinput - reginfo->strbeg; \
5589  DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5590   "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5591   PTR2UV(rex), \
5592   PTR2UV(rex->offs), \
5593   (UV)n, \
5594   (IV)rex->offs[n].start, \
5595   (IV)rex->offs[n].end \
5596  ))
5597
5598   case CLOSE:  /*  )  */
5599    n = ARG(scan);  /* which paren pair */
5600    CLOSE_CAPTURE;
5601    if (n > rex->lastparen)
5602     rex->lastparen = n;
5603    rex->lastcloseparen = n;
5604    if (cur_eval && cur_eval->u.eval.close_paren == n) {
5605     goto fake_end;
5606    }
5607    break;
5608
5609   case ACCEPT:  /*  (*ACCEPT)  */
5610    if (ARG(scan)){
5611     regnode *cursor;
5612     for (cursor=scan;
5613      cursor && OP(cursor)!=END;
5614      cursor=regnext(cursor))
5615     {
5616      if ( OP(cursor)==CLOSE ){
5617       n = ARG(cursor);
5618       if ( n <= lastopen ) {
5619        CLOSE_CAPTURE;
5620        if (n > rex->lastparen)
5621         rex->lastparen = n;
5622        rex->lastcloseparen = n;
5623        if ( n == ARG(scan) || (cur_eval &&
5624         cur_eval->u.eval.close_paren == n))
5625         break;
5626       }
5627      }
5628     }
5629    }
5630    goto fake_end;
5631    /* NOTREACHED */
5632
5633   case GROUPP:  /*  (?(1))  */
5634    n = ARG(scan);  /* which paren pair */
5635    sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5636    break;
5637
5638   case NGROUPP:  /*  (?(<name>))  */
5639    /* reg_check_named_buff_matched returns 0 for no match */
5640    sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5641    break;
5642
5643   case INSUBP:   /*  (?(R))  */
5644    n = ARG(scan);
5645    sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5646    break;
5647
5648   case DEFINEP:  /*  (?(DEFINE))  */
5649    sw = 0;
5650    break;
5651
5652   case IFTHEN:   /*  (?(cond)A|B)  */
5653    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5654    if (sw)
5655     next = NEXTOPER(NEXTOPER(scan));
5656    else {
5657     next = scan + ARG(scan);
5658     if (OP(next) == IFTHEN) /* Fake one. */
5659      next = NEXTOPER(NEXTOPER(next));
5660    }
5661    break;
5662
5663   case LOGICAL:  /* modifier for EVAL and IFMATCH */
5664    logical = scan->flags;
5665    break;
5666
5667 /*******************************************************************
5668
5669 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5670 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5671 STAR/PLUS/CURLY/CURLYN are used instead.)
5672
5673 A*B is compiled as <CURLYX><A><WHILEM><B>
5674
5675 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5676 state, which contains the current count, initialised to -1. It also sets
5677 cur_curlyx to point to this state, with any previous value saved in the
5678 state block.
5679
5680 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5681 since the pattern may possibly match zero times (i.e. it's a while {} loop
5682 rather than a do {} while loop).
5683
5684 Each entry to WHILEM represents a successful match of A. The count in the
5685 CURLYX block is incremented, another WHILEM state is pushed, and execution
5686 passes to A or B depending on greediness and the current count.
5687
5688 For example, if matching against the string a1a2a3b (where the aN are
5689 substrings that match /A/), then the match progresses as follows: (the
5690 pushed states are interspersed with the bits of strings matched so far):
5691
5692  <CURLYX cnt=-1>
5693  <CURLYX cnt=0><WHILEM>
5694  <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5695  <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5696  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5697  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5698
5699 (Contrast this with something like CURLYM, which maintains only a single
5700 backtrack state:
5701
5702  <CURLYM cnt=0> a1
5703  a1 <CURLYM cnt=1> a2
5704  a1 a2 <CURLYM cnt=2> a3
5705  a1 a2 a3 <CURLYM cnt=3> b
5706 )
5707
5708 Each WHILEM state block marks a point to backtrack to upon partial failure
5709 of A or B, and also contains some minor state data related to that
5710 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5711 overall state, such as the count, and pointers to the A and B ops.
5712
5713 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5714 must always point to the *current* CURLYX block, the rules are:
5715
5716 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5717 and set cur_curlyx to point the new block.
5718
5719 When popping the CURLYX block after a successful or unsuccessful match,
5720 restore the previous cur_curlyx.
5721
5722 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5723 to the outer one saved in the CURLYX block.
5724
5725 When popping the WHILEM block after a successful or unsuccessful B match,
5726 restore the previous cur_curlyx.
5727
5728 Here's an example for the pattern (AI* BI)*BO
5729 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5730
5731 cur_
5732 curlyx backtrack stack
5733 ------ ---------------
5734 NULL
5735 CO     <CO prev=NULL> <WO>
5736 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5737 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5738 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5739
5740 At this point the pattern succeeds, and we work back down the stack to
5741 clean up, restoring as we go:
5742
5743 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5744 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5745 CO     <CO prev=NULL> <WO>
5746 NULL
5747
5748 *******************************************************************/
5749
5750 #define ST st->u.curlyx
5751
5752   case CURLYX:    /* start of /A*B/  (for complex A) */
5753   {
5754    /* No need to save/restore up to this paren */
5755    I32 parenfloor = scan->flags;
5756
5757    assert(next); /* keep Coverity happy */
5758    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5759     next += ARG(next);
5760
5761    /* XXXX Probably it is better to teach regpush to support
5762    parenfloor > maxopenparen ... */
5763    if (parenfloor > (I32)rex->lastparen)
5764     parenfloor = rex->lastparen; /* Pessimization... */
5765
5766    ST.prev_curlyx= cur_curlyx;
5767    cur_curlyx = st;
5768    ST.cp = PL_savestack_ix;
5769
5770    /* these fields contain the state of the current curly.
5771    * they are accessed by subsequent WHILEMs */
5772    ST.parenfloor = parenfloor;
5773    ST.me = scan;
5774    ST.B = next;
5775    ST.minmod = minmod;
5776    minmod = 0;
5777    ST.count = -1; /* this will be updated by WHILEM */
5778    ST.lastloc = NULL;  /* this will be updated by WHILEM */
5779
5780    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5781    /* NOTREACHED */
5782    assert(0);
5783   }
5784
5785   case CURLYX_end: /* just finished matching all of A*B */
5786    cur_curlyx = ST.prev_curlyx;
5787    sayYES;
5788    /* NOTREACHED */
5789    assert(0);
5790
5791   case CURLYX_end_fail: /* just failed to match all of A*B */
5792    regcpblow(ST.cp);
5793    cur_curlyx = ST.prev_curlyx;
5794    sayNO;
5795    /* NOTREACHED */
5796    assert(0);
5797
5798
5799 #undef ST
5800 #define ST st->u.whilem
5801
5802   case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5803   {
5804    /* see the discussion above about CURLYX/WHILEM */
5805    I32 n;
5806    int min, max;
5807    regnode *A;
5808
5809    assert(cur_curlyx); /* keep Coverity happy */
5810
5811    min = ARG1(cur_curlyx->u.curlyx.me);
5812    max = ARG2(cur_curlyx->u.curlyx.me);
5813    A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5814    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5815    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5816    ST.cache_offset = 0;
5817    ST.cache_mask = 0;
5818
5819
5820    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5821     "%*s  whilem: matched %ld out of %d..%d\n",
5822     REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5823    );
5824
5825    /* First just match a string of min A's. */
5826
5827    if (n < min) {
5828     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5829          maxopenparen);
5830     cur_curlyx->u.curlyx.lastloc = locinput;
5831     REGCP_SET(ST.lastcp);
5832
5833     PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5834     /* NOTREACHED */
5835     assert(0);
5836    }
5837
5838    /* If degenerate A matches "", assume A done. */
5839
5840    if (locinput == cur_curlyx->u.curlyx.lastloc) {
5841     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5842     "%*s  whilem: empty match detected, trying continuation...\n",
5843     REPORT_CODE_OFF+depth*2, "")
5844     );
5845     goto do_whilem_B_max;
5846    }
5847
5848    /* super-linear cache processing.
5849    *
5850    * The idea here is that for certain types of CURLYX/WHILEM -
5851    * principally those whose upper bound is infinity (and
5852    * excluding regexes that have things like \1 and other very
5853    * non-regular expresssiony things), then if a pattern like
5854    * /....A*.../ fails and we backtrack to the WHILEM, then we
5855    * make a note that this particular WHILEM op was at string
5856    * position 47 (say) when the rest of pattern failed. Then, if
5857    * we ever find ourselves back at that WHILEM, and at string
5858    * position 47 again, we can just fail immediately rather than
5859    * running the rest of the pattern again.
5860    *
5861    * This is very handy when patterns start to go
5862    * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5863    * with a combinatorial explosion of backtracking.
5864    *
5865    * The cache is implemented as a bit array, with one bit per
5866    * string byte position per WHILEM op (up to 16) - so its
5867    * between 0.25 and 2x the string size.
5868    *
5869    * To avoid allocating a poscache buffer every time, we do an
5870    * initially countdown; only after we have  executed a WHILEM
5871    * op (string-length x #WHILEMs) times do we allocate the
5872    * cache.
5873    *
5874    * The top 4 bits of scan->flags byte say how many different
5875    * relevant CURLLYX/WHILEM op pairs there are, while the
5876    * bottom 4-bits is the identifying index number of this
5877    * WHILEM.
5878    */
5879
5880    if (scan->flags) {
5881
5882     if (!reginfo->poscache_maxiter) {
5883      /* start the countdown: Postpone detection until we
5884      * know the match is not *that* much linear. */
5885      reginfo->poscache_maxiter
5886       =    (reginfo->strend - reginfo->strbeg + 1)
5887       * (scan->flags>>4);
5888      /* possible overflow for long strings and many CURLYX's */
5889      if (reginfo->poscache_maxiter < 0)
5890       reginfo->poscache_maxiter = I32_MAX;
5891      reginfo->poscache_iter = reginfo->poscache_maxiter;
5892     }
5893
5894     if (reginfo->poscache_iter-- == 0) {
5895      /* initialise cache */
5896      const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
5897      regmatch_info_aux *const aux = reginfo->info_aux;
5898      if (aux->poscache) {
5899       if ((SSize_t)reginfo->poscache_size < size) {
5900        Renew(aux->poscache, size, char);
5901        reginfo->poscache_size = size;
5902       }
5903       Zero(aux->poscache, size, char);
5904      }
5905      else {
5906       reginfo->poscache_size = size;
5907       Newxz(aux->poscache, size, char);
5908      }
5909      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5910  "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5911        PL_colors[4], PL_colors[5])
5912      );
5913     }
5914
5915     if (reginfo->poscache_iter < 0) {
5916      /* have we already failed at this position? */
5917      SSize_t offset, mask;
5918
5919      reginfo->poscache_iter = -1; /* stop eventual underflow */
5920      offset  = (scan->flags & 0xf) - 1
5921         +   (locinput - reginfo->strbeg)
5922         * (scan->flags>>4);
5923      mask    = 1 << (offset % 8);
5924      offset /= 8;
5925      if (reginfo->info_aux->poscache[offset] & mask) {
5926       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5927        "%*s  whilem: (cache) already tried at this position...\n",
5928        REPORT_CODE_OFF+depth*2, "")
5929       );
5930       sayNO; /* cache records failure */
5931      }
5932      ST.cache_offset = offset;
5933      ST.cache_mask   = mask;
5934     }
5935    }
5936
5937    /* Prefer B over A for minimal matching. */
5938
5939    if (cur_curlyx->u.curlyx.minmod) {
5940     ST.save_curlyx = cur_curlyx;
5941     cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5942     ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5943        maxopenparen);
5944     REGCP_SET(ST.lastcp);
5945     PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5946          locinput);
5947     /* NOTREACHED */
5948     assert(0);
5949    }
5950
5951    /* Prefer A over B for maximal matching. */
5952
5953    if (n < max) { /* More greed allowed? */
5954     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5955        maxopenparen);
5956     cur_curlyx->u.curlyx.lastloc = locinput;
5957     REGCP_SET(ST.lastcp);
5958     PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5959     /* NOTREACHED */
5960     assert(0);
5961    }
5962    goto do_whilem_B_max;
5963   }
5964   /* NOTREACHED */
5965   assert(0);
5966
5967   case WHILEM_B_min: /* just matched B in a minimal match */
5968   case WHILEM_B_max: /* just matched B in a maximal match */
5969    cur_curlyx = ST.save_curlyx;
5970    sayYES;
5971    /* NOTREACHED */
5972    assert(0);
5973
5974   case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5975    cur_curlyx = ST.save_curlyx;
5976    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5977    cur_curlyx->u.curlyx.count--;
5978    CACHEsayNO;
5979    /* NOTREACHED */
5980    assert(0);
5981
5982   case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5983    /* FALLTHROUGH */
5984   case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5985    REGCP_UNWIND(ST.lastcp);
5986    regcppop(rex, &maxopenparen);
5987    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5988    cur_curlyx->u.curlyx.count--;
5989    CACHEsayNO;
5990    /* NOTREACHED */
5991    assert(0);
5992
5993   case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5994    REGCP_UNWIND(ST.lastcp);
5995    regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5996    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5997     "%*s  whilem: failed, trying continuation...\n",
5998     REPORT_CODE_OFF+depth*2, "")
5999    );
6000   do_whilem_B_max:
6001    if (cur_curlyx->u.curlyx.count >= REG_INFTY
6002     && ckWARN(WARN_REGEXP)
6003     && !reginfo->warned)
6004    {
6005     reginfo->warned = TRUE;
6006     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6007      "Complex regular subexpression recursion limit (%d) "
6008      "exceeded",
6009      REG_INFTY - 1);
6010    }
6011
6012    /* now try B */
6013    ST.save_curlyx = cur_curlyx;
6014    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6015    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
6016         locinput);
6017    /* NOTREACHED */
6018    assert(0);
6019
6020   case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
6021    cur_curlyx = ST.save_curlyx;
6022    REGCP_UNWIND(ST.lastcp);
6023    regcppop(rex, &maxopenparen);
6024
6025    if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
6026     /* Maximum greed exceeded */
6027     if (cur_curlyx->u.curlyx.count >= REG_INFTY
6028      && ckWARN(WARN_REGEXP)
6029      && !reginfo->warned)
6030     {
6031      reginfo->warned = TRUE;
6032      Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6033       "Complex regular subexpression recursion "
6034       "limit (%d) exceeded",
6035       REG_INFTY - 1);
6036     }
6037     cur_curlyx->u.curlyx.count--;
6038     CACHEsayNO;
6039    }
6040
6041    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6042     "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6043    );
6044    /* Try grabbing another A and see if it helps. */
6045    cur_curlyx->u.curlyx.lastloc = locinput;
6046    ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6047        maxopenparen);
6048    REGCP_SET(ST.lastcp);
6049    PUSH_STATE_GOTO(WHILEM_A_min,
6050     /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6051     locinput);
6052    /* NOTREACHED */
6053    assert(0);
6054
6055 #undef  ST
6056 #define ST st->u.branch
6057
6058   case BRANCHJ:     /*  /(...|A|...)/ with long next pointer */
6059    next = scan + ARG(scan);
6060    if (next == scan)
6061     next = NULL;
6062    scan = NEXTOPER(scan);
6063    /* FALLTHROUGH */
6064
6065   case BRANCH:     /*  /(...|A|...)/ */
6066    scan = NEXTOPER(scan); /* scan now points to inner node */
6067    ST.lastparen = rex->lastparen;
6068    ST.lastcloseparen = rex->lastcloseparen;
6069    ST.next_branch = next;
6070    REGCP_SET(ST.cp);
6071
6072    /* Now go into the branch */
6073    if (has_cutgroup) {
6074     PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
6075    } else {
6076     PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
6077    }
6078    /* NOTREACHED */
6079    assert(0);
6080
6081   case CUTGROUP:  /*  /(*THEN)/  */
6082    sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
6083     MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6084    PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
6085    /* NOTREACHED */
6086    assert(0);
6087
6088   case CUTGROUP_next_fail:
6089    do_cutgroup = 1;
6090    no_final = 1;
6091    if (st->u.mark.mark_name)
6092     sv_commit = st->u.mark.mark_name;
6093    sayNO;
6094    /* NOTREACHED */
6095    assert(0);
6096
6097   case BRANCH_next:
6098    sayYES;
6099    /* NOTREACHED */
6100    assert(0);
6101
6102   case BRANCH_next_fail: /* that branch failed; try the next, if any */
6103    if (do_cutgroup) {
6104     do_cutgroup = 0;
6105     no_final = 0;
6106    }
6107    REGCP_UNWIND(ST.cp);
6108    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6109    scan = ST.next_branch;
6110    /* no more branches? */
6111    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
6112     DEBUG_EXECUTE_r({
6113      PerlIO_printf( Perl_debug_log,
6114       "%*s  %sBRANCH failed...%s\n",
6115       REPORT_CODE_OFF+depth*2, "",
6116       PL_colors[4],
6117       PL_colors[5] );
6118     });
6119     sayNO_SILENT;
6120    }
6121    continue; /* execute next BRANCH[J] op */
6122    /* NOTREACHED */
6123    assert(0);
6124
6125   case MINMOD: /* next op will be non-greedy, e.g. A*?  */
6126    minmod = 1;
6127    break;
6128
6129 #undef  ST
6130 #define ST st->u.curlym
6131
6132   case CURLYM: /* /A{m,n}B/ where A is fixed-length */
6133
6134    /* This is an optimisation of CURLYX that enables us to push
6135    * only a single backtracking state, no matter how many matches
6136    * there are in {m,n}. It relies on the pattern being constant
6137    * length, with no parens to influence future backrefs
6138    */
6139
6140    ST.me = scan;
6141    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6142
6143    ST.lastparen      = rex->lastparen;
6144    ST.lastcloseparen = rex->lastcloseparen;
6145
6146    /* if paren positive, emulate an OPEN/CLOSE around A */
6147    if (ST.me->flags) {
6148     U32 paren = ST.me->flags;
6149     if (paren > maxopenparen)
6150      maxopenparen = paren;
6151     scan += NEXT_OFF(scan); /* Skip former OPEN. */
6152    }
6153    ST.A = scan;
6154    ST.B = next;
6155    ST.alen = 0;
6156    ST.count = 0;
6157    ST.minmod = minmod;
6158    minmod = 0;
6159    ST.c1 = CHRTEST_UNINIT;
6160    REGCP_SET(ST.cp);
6161
6162    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
6163     goto curlym_do_B;
6164
6165   curlym_do_A: /* execute the A in /A{m,n}B/  */
6166    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
6167    /* NOTREACHED */
6168    assert(0);
6169
6170   case CURLYM_A: /* we've just matched an A */
6171    ST.count++;
6172    /* after first match, determine A's length: u.curlym.alen */
6173    if (ST.count == 1) {
6174     if (reginfo->is_utf8_target) {
6175      char *s = st->locinput;
6176      while (s < locinput) {
6177       ST.alen++;
6178       s += UTF8SKIP(s);
6179      }
6180     }
6181     else {
6182      ST.alen = locinput - st->locinput;
6183     }
6184     if (ST.alen == 0)
6185      ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
6186    }
6187    DEBUG_EXECUTE_r(
6188     PerlIO_printf(Perl_debug_log,
6189       "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
6190       (int)(REPORT_CODE_OFF+(depth*2)), "",
6191       (IV) ST.count, (IV)ST.alen)
6192    );
6193
6194    if (cur_eval && cur_eval->u.eval.close_paren &&
6195     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6196     goto fake_end;
6197
6198    {
6199     I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
6200     if ( max == REG_INFTY || ST.count < max )
6201      goto curlym_do_A; /* try to match another A */
6202    }
6203    goto curlym_do_B; /* try to match B */
6204
6205   case CURLYM_A_fail: /* just failed to match an A */
6206    REGCP_UNWIND(ST.cp);
6207
6208    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
6209     || (cur_eval && cur_eval->u.eval.close_paren &&
6210      cur_eval->u.eval.close_paren == (U32)ST.me->flags))
6211     sayNO;
6212
6213   curlym_do_B: /* execute the B in /A{m,n}B/  */
6214    if (ST.c1 == CHRTEST_UNINIT) {
6215     /* calculate c1 and c2 for possible match of 1st char
6216     * following curly */
6217     ST.c1 = ST.c2 = CHRTEST_VOID;
6218     assert(ST.B);
6219     if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
6220      regnode *text_node = ST.B;
6221      if (! HAS_TEXT(text_node))
6222       FIND_NEXT_IMPT(text_node);
6223      /* this used to be
6224
6225       (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
6226
6227        But the former is redundant in light of the latter.
6228
6229        if this changes back then the macro for
6230        IS_TEXT and friends need to change.
6231      */
6232      if (PL_regkind[OP(text_node)] == EXACT) {
6233       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6234       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6235       reginfo))
6236       {
6237        sayNO;
6238       }
6239      }
6240     }
6241    }
6242
6243    DEBUG_EXECUTE_r(
6244     PerlIO_printf(Perl_debug_log,
6245      "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
6246      (int)(REPORT_CODE_OFF+(depth*2)),
6247      "", (IV)ST.count)
6248     );
6249    if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
6250     if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
6251      if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6252       && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6253      {
6254       /* simulate B failing */
6255       DEBUG_OPTIMISE_r(
6256        PerlIO_printf(Perl_debug_log,
6257         "%*s  CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
6258         (int)(REPORT_CODE_OFF+(depth*2)),"",
6259         valid_utf8_to_uvchr((U8 *) locinput, NULL),
6260         valid_utf8_to_uvchr(ST.c1_utf8, NULL),
6261         valid_utf8_to_uvchr(ST.c2_utf8, NULL))
6262       );
6263       state_num = CURLYM_B_fail;
6264       goto reenter_switch;
6265      }
6266     }
6267     else if (nextchr != ST.c1 && nextchr != ST.c2) {
6268      /* simulate B failing */
6269      DEBUG_OPTIMISE_r(
6270       PerlIO_printf(Perl_debug_log,
6271        "%*s  CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
6272        (int)(REPORT_CODE_OFF+(depth*2)),"",
6273        (int) nextchr, ST.c1, ST.c2)
6274      );
6275      state_num = CURLYM_B_fail;
6276      goto reenter_switch;
6277     }
6278    }
6279
6280    if (ST.me->flags) {
6281     /* emulate CLOSE: mark current A as captured */
6282     I32 paren = ST.me->flags;
6283     if (ST.count) {
6284      rex->offs[paren].start
6285       = HOPc(locinput, -ST.alen) - reginfo->strbeg;
6286      rex->offs[paren].end = locinput - reginfo->strbeg;
6287      if ((U32)paren > rex->lastparen)
6288       rex->lastparen = paren;
6289      rex->lastcloseparen = paren;
6290     }
6291     else
6292      rex->offs[paren].end = -1;
6293     if (cur_eval && cur_eval->u.eval.close_paren &&
6294      cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6295     {
6296      if (ST.count)
6297       goto fake_end;
6298      else
6299       sayNO;
6300     }
6301    }
6302
6303    PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
6304    /* NOTREACHED */
6305    assert(0);
6306
6307   case CURLYM_B_fail: /* just failed to match a B */
6308    REGCP_UNWIND(ST.cp);
6309    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6310    if (ST.minmod) {
6311     I32 max = ARG2(ST.me);
6312     if (max != REG_INFTY && ST.count == max)
6313      sayNO;
6314     goto curlym_do_A; /* try to match a further A */
6315    }
6316    /* backtrack one A */
6317    if (ST.count == ARG1(ST.me) /* min */)
6318     sayNO;
6319    ST.count--;
6320    SET_locinput(HOPc(locinput, -ST.alen));
6321    goto curlym_do_B; /* try to match B */
6322
6323 #undef ST
6324 #define ST st->u.curly
6325
6326 #define CURLY_SETPAREN(paren, success) \
6327  if (paren) { \
6328   if (success) { \
6329    rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6330    rex->offs[paren].end = locinput - reginfo->strbeg; \
6331    if (paren > rex->lastparen) \
6332     rex->lastparen = paren; \
6333    rex->lastcloseparen = paren; \
6334   } \
6335   else { \
6336    rex->offs[paren].end = -1; \
6337    rex->lastparen      = ST.lastparen; \
6338    rex->lastcloseparen = ST.lastcloseparen; \
6339   } \
6340  }
6341
6342   case STAR:  /*  /A*B/ where A is width 1 char */
6343    ST.paren = 0;
6344    ST.min = 0;
6345    ST.max = REG_INFTY;
6346    scan = NEXTOPER(scan);
6347    goto repeat;
6348
6349   case PLUS:  /*  /A+B/ where A is width 1 char */
6350    ST.paren = 0;
6351    ST.min = 1;
6352    ST.max = REG_INFTY;
6353    scan = NEXTOPER(scan);
6354    goto repeat;
6355
6356   case CURLYN:  /*  /(A){m,n}B/ where A is width 1 char */
6357    ST.paren = scan->flags; /* Which paren to set */
6358    ST.lastparen      = rex->lastparen;
6359    ST.lastcloseparen = rex->lastcloseparen;
6360    if (ST.paren > maxopenparen)
6361     maxopenparen = ST.paren;
6362    ST.min = ARG1(scan);  /* min to match */
6363    ST.max = ARG2(scan);  /* max to match */
6364    if (cur_eval && cur_eval->u.eval.close_paren &&
6365     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6366     ST.min=1;
6367     ST.max=1;
6368    }
6369    scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6370    goto repeat;
6371
6372   case CURLY:  /*  /A{m,n}B/ where A is width 1 char */
6373    ST.paren = 0;
6374    ST.min = ARG1(scan);  /* min to match */
6375    ST.max = ARG2(scan);  /* max to match */
6376    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6377   repeat:
6378    /*
6379    * Lookahead to avoid useless match attempts
6380    * when we know what character comes next.
6381    *
6382    * Used to only do .*x and .*?x, but now it allows
6383    * for )'s, ('s and (?{ ... })'s to be in the way
6384    * of the quantifier and the EXACT-like node.  -- japhy
6385    */
6386
6387    assert(ST.min <= ST.max);
6388    if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6389     ST.c1 = ST.c2 = CHRTEST_VOID;
6390    }
6391    else {
6392     regnode *text_node = next;
6393
6394     if (! HAS_TEXT(text_node))
6395      FIND_NEXT_IMPT(text_node);
6396
6397     if (! HAS_TEXT(text_node))
6398      ST.c1 = ST.c2 = CHRTEST_VOID;
6399     else {
6400      if ( PL_regkind[OP(text_node)] != EXACT ) {
6401       ST.c1 = ST.c2 = CHRTEST_VOID;
6402      }
6403      else {
6404
6405      /*  Currently we only get here when
6406
6407       PL_rekind[OP(text_node)] == EXACT
6408
6409       if this changes back then the macro for IS_TEXT and
6410       friends need to change. */
6411       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6412       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6413       reginfo))
6414       {
6415        sayNO;
6416       }
6417      }
6418     }
6419    }
6420
6421    ST.A = scan;
6422    ST.B = next;
6423    if (minmod) {
6424     char *li = locinput;
6425     minmod = 0;
6426     if (ST.min &&
6427       regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6428        < ST.min)
6429      sayNO;
6430     SET_locinput(li);
6431     ST.count = ST.min;
6432     REGCP_SET(ST.cp);
6433     if (ST.c1 == CHRTEST_VOID)
6434      goto curly_try_B_min;
6435
6436     ST.oldloc = locinput;
6437
6438     /* set ST.maxpos to the furthest point along the
6439     * string that could possibly match */
6440     if  (ST.max == REG_INFTY) {
6441      ST.maxpos = reginfo->strend - 1;
6442      if (utf8_target)
6443       while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6444        ST.maxpos--;
6445     }
6446     else if (utf8_target) {
6447      int m = ST.max - ST.min;
6448      for (ST.maxpos = locinput;
6449       m >0 && ST.maxpos < reginfo->strend; m--)
6450       ST.maxpos += UTF8SKIP(ST.maxpos);
6451     }
6452     else {
6453      ST.maxpos = locinput + ST.max - ST.min;
6454      if (ST.maxpos >= reginfo->strend)
6455       ST.maxpos = reginfo->strend - 1;
6456     }
6457     goto curly_try_B_min_known;
6458
6459    }
6460    else {
6461     /* avoid taking address of locinput, so it can remain
6462     * a register var */
6463     char *li = locinput;
6464     ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6465     if (ST.count < ST.min)
6466      sayNO;
6467     SET_locinput(li);
6468     if ((ST.count > ST.min)
6469      && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6470     {
6471      /* A{m,n} must come at the end of the string, there's
6472      * no point in backing off ... */
6473      ST.min = ST.count;
6474      /* ...except that $ and \Z can match before *and* after
6475      newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6476      We may back off by one in this case. */
6477      if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6478       ST.min--;
6479     }
6480     REGCP_SET(ST.cp);
6481     goto curly_try_B_max;
6482    }
6483    /* NOTREACHED */
6484    assert(0);
6485
6486   case CURLY_B_min_known_fail:
6487    /* failed to find B in a non-greedy match where c1,c2 valid */
6488
6489    REGCP_UNWIND(ST.cp);
6490    if (ST.paren) {
6491     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6492    }
6493    /* Couldn't or didn't -- move forward. */
6494    ST.oldloc = locinput;
6495    if (utf8_target)
6496     locinput += UTF8SKIP(locinput);
6497    else
6498     locinput++;
6499    ST.count++;
6500   curly_try_B_min_known:
6501    /* find the next place where 'B' could work, then call B */
6502    {
6503     int n;
6504     if (utf8_target) {
6505      n = (ST.oldloc == locinput) ? 0 : 1;
6506      if (ST.c1 == ST.c2) {
6507       /* set n to utf8_distance(oldloc, locinput) */
6508       while (locinput <= ST.maxpos
6509        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6510       {
6511        locinput += UTF8SKIP(locinput);
6512        n++;
6513       }
6514      }
6515      else {
6516       /* set n to utf8_distance(oldloc, locinput) */
6517       while (locinput <= ST.maxpos
6518        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6519        && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6520       {
6521        locinput += UTF8SKIP(locinput);
6522        n++;
6523       }
6524      }
6525     }
6526     else {  /* Not utf8_target */
6527      if (ST.c1 == ST.c2) {
6528       while (locinput <= ST.maxpos &&
6529        UCHARAT(locinput) != ST.c1)
6530        locinput++;
6531      }
6532      else {
6533       while (locinput <= ST.maxpos
6534        && UCHARAT(locinput) != ST.c1
6535        && UCHARAT(locinput) != ST.c2)
6536        locinput++;
6537      }
6538      n = locinput - ST.oldloc;
6539     }
6540     if (locinput > ST.maxpos)
6541      sayNO;
6542     if (n) {
6543      /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6544      * at b; check that everything between oldloc and
6545      * locinput matches */
6546      char *li = ST.oldloc;
6547      ST.count += n;
6548      if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6549       sayNO;
6550      assert(n == REG_INFTY || locinput == li);
6551     }
6552     CURLY_SETPAREN(ST.paren, ST.count);
6553     if (cur_eval && cur_eval->u.eval.close_paren &&
6554      cur_eval->u.eval.close_paren == (U32)ST.paren) {
6555      goto fake_end;
6556     }
6557     PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6558    }
6559    /* NOTREACHED */
6560    assert(0);
6561
6562   case CURLY_B_min_fail:
6563    /* failed to find B in a non-greedy match where c1,c2 invalid */
6564
6565    REGCP_UNWIND(ST.cp);
6566    if (ST.paren) {
6567     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6568    }
6569    /* failed -- move forward one */
6570    {
6571     char *li = locinput;
6572     if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6573      sayNO;
6574     }
6575     locinput = li;
6576    }
6577    {
6578     ST.count++;
6579     if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6580       ST.count > 0)) /* count overflow ? */
6581     {
6582     curly_try_B_min:
6583      CURLY_SETPAREN(ST.paren, ST.count);
6584      if (cur_eval && cur_eval->u.eval.close_paren &&
6585       cur_eval->u.eval.close_paren == (U32)ST.paren) {
6586       goto fake_end;
6587      }
6588      PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6589     }
6590    }
6591    sayNO;
6592    /* NOTREACHED */
6593    assert(0);
6594
6595   curly_try_B_max:
6596    /* a successful greedy match: now try to match B */
6597    if (cur_eval && cur_eval->u.eval.close_paren &&
6598     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6599     goto fake_end;
6600    }
6601    {
6602     bool could_match = locinput < reginfo->strend;
6603
6604     /* If it could work, try it. */
6605     if (ST.c1 != CHRTEST_VOID && could_match) {
6606      if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6607      {
6608       could_match = memEQ(locinput,
6609            ST.c1_utf8,
6610            UTF8SKIP(locinput))
6611          || memEQ(locinput,
6612            ST.c2_utf8,
6613            UTF8SKIP(locinput));
6614      }
6615      else {
6616       could_match = UCHARAT(locinput) == ST.c1
6617          || UCHARAT(locinput) == ST.c2;
6618      }
6619     }
6620     if (ST.c1 == CHRTEST_VOID || could_match) {
6621      CURLY_SETPAREN(ST.paren, ST.count);
6622      PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6623      /* NOTREACHED */
6624      assert(0);
6625     }
6626    }
6627    /* FALLTHROUGH */
6628
6629   case CURLY_B_max_fail:
6630    /* failed to find B in a greedy match */
6631
6632    REGCP_UNWIND(ST.cp);
6633    if (ST.paren) {
6634     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6635    }
6636    /*  back up. */
6637    if (--ST.count < ST.min)
6638     sayNO;
6639    locinput = HOPc(locinput, -1);
6640    goto curly_try_B_max;
6641
6642 #undef ST
6643
6644   case END: /*  last op of main pattern  */
6645    fake_end:
6646    if (cur_eval) {
6647     /* we've just finished A in /(??{A})B/; now continue with B */
6648
6649     st->u.eval.prev_rex = rex_sv;  /* inner */
6650
6651     /* Save *all* the positions. */
6652     st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6653     rex_sv = cur_eval->u.eval.prev_rex;
6654     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6655     SET_reg_curpm(rex_sv);
6656     rex = ReANY(rex_sv);
6657     rexi = RXi_GET(rex);
6658     cur_curlyx = cur_eval->u.eval.prev_curlyx;
6659
6660     REGCP_SET(st->u.eval.lastcp);
6661
6662     /* Restore parens of the outer rex without popping the
6663     * savestack */
6664     S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6665           &maxopenparen);
6666
6667     st->u.eval.prev_eval = cur_eval;
6668     cur_eval = cur_eval->u.eval.prev_eval;
6669     DEBUG_EXECUTE_r(
6670      PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6671          REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6672     if ( nochange_depth )
6673      nochange_depth--;
6674
6675     PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6676          locinput); /* match B */
6677    }
6678
6679    if (locinput < reginfo->till) {
6680     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6681          "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6682          PL_colors[4],
6683          (long)(locinput - startpos),
6684          (long)(reginfo->till - startpos),
6685          PL_colors[5]));
6686
6687     sayNO_SILENT;  /* Cannot match: too short. */
6688    }
6689    sayYES;   /* Success! */
6690
6691   case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6692    DEBUG_EXECUTE_r(
6693    PerlIO_printf(Perl_debug_log,
6694     "%*s  %ssubpattern success...%s\n",
6695     REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6696    sayYES;   /* Success! */
6697
6698 #undef  ST
6699 #define ST st->u.ifmatch
6700
6701   {
6702    char *newstart;
6703
6704   case SUSPEND: /* (?>A) */
6705    ST.wanted = 1;
6706    newstart = locinput;
6707    goto do_ifmatch;
6708
6709   case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6710    ST.wanted = 0;
6711    goto ifmatch_trivial_fail_test;
6712
6713   case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6714    ST.wanted = 1;
6715   ifmatch_trivial_fail_test:
6716    if (scan->flags) {
6717     char * const s = HOPBACKc(locinput, scan->flags);
6718     if (!s) {
6719      /* trivial fail */
6720      if (logical) {
6721       logical = 0;
6722       sw = 1 - cBOOL(ST.wanted);
6723      }
6724      else if (ST.wanted)
6725       sayNO;
6726      next = scan + ARG(scan);
6727      if (next == scan)
6728       next = NULL;
6729      break;
6730     }
6731     newstart = s;
6732    }
6733    else
6734     newstart = locinput;
6735
6736   do_ifmatch:
6737    ST.me = scan;
6738    ST.logical = logical;
6739    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6740
6741    /* execute body of (?...A) */
6742    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6743    /* NOTREACHED */
6744    assert(0);
6745   }
6746
6747   case IFMATCH_A_fail: /* body of (?...A) failed */
6748    ST.wanted = !ST.wanted;
6749    /* FALLTHROUGH */
6750
6751   case IFMATCH_A: /* body of (?...A) succeeded */
6752    if (ST.logical) {
6753     sw = cBOOL(ST.wanted);
6754    }
6755    else if (!ST.wanted)
6756     sayNO;
6757
6758    if (OP(ST.me) != SUSPEND) {
6759     /* restore old position except for (?>...) */
6760     locinput = st->locinput;
6761    }
6762    scan = ST.me + ARG(ST.me);
6763    if (scan == ST.me)
6764     scan = NULL;
6765    continue; /* execute B */
6766
6767 #undef ST
6768
6769   case LONGJMP: /*  alternative with many branches compiles to
6770      * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6771    next = scan + ARG(scan);
6772    if (next == scan)
6773     next = NULL;
6774    break;
6775
6776   case COMMIT:  /*  (*COMMIT)  */
6777    reginfo->cutpoint = reginfo->strend;
6778    /* FALLTHROUGH */
6779
6780   case PRUNE:   /*  (*PRUNE)   */
6781    if (!scan->flags)
6782     sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6783    PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6784    /* NOTREACHED */
6785    assert(0);
6786
6787   case COMMIT_next_fail:
6788    no_final = 1;
6789    /* FALLTHROUGH */
6790
6791   case OPFAIL:   /* (*FAIL)  */
6792    sayNO;
6793    /* NOTREACHED */
6794    assert(0);
6795
6796 #define ST st->u.mark
6797   case MARKPOINT: /*  (*MARK:foo)  */
6798    ST.prev_mark = mark_state;
6799    ST.mark_name = sv_commit = sv_yes_mark
6800     = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6801    mark_state = st;
6802    ST.mark_loc = locinput;
6803    PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6804    /* NOTREACHED */
6805    assert(0);
6806
6807   case MARKPOINT_next:
6808    mark_state = ST.prev_mark;
6809    sayYES;
6810    /* NOTREACHED */
6811    assert(0);
6812
6813   case MARKPOINT_next_fail:
6814    if (popmark && sv_eq(ST.mark_name,popmark))
6815    {
6816     if (ST.mark_loc > startpoint)
6817      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6818     popmark = NULL; /* we found our mark */
6819     sv_commit = ST.mark_name;
6820
6821     DEBUG_EXECUTE_r({
6822       PerlIO_printf(Perl_debug_log,
6823        "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6824        REPORT_CODE_OFF+depth*2, "",
6825        PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6826     });
6827    }
6828    mark_state = ST.prev_mark;
6829    sv_yes_mark = mark_state ?
6830     mark_state->u.mark.mark_name : NULL;
6831    sayNO;
6832    /* NOTREACHED */
6833    assert(0);
6834
6835   case SKIP:  /*  (*SKIP)  */
6836    if (scan->flags) {
6837     /* (*SKIP) : if we fail we cut here*/
6838     ST.mark_name = NULL;
6839     ST.mark_loc = locinput;
6840     PUSH_STATE_GOTO(SKIP_next,next, locinput);
6841    } else {
6842     /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6843     otherwise do nothing.  Meaning we need to scan
6844     */
6845     regmatch_state *cur = mark_state;
6846     SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6847
6848     while (cur) {
6849      if ( sv_eq( cur->u.mark.mark_name,
6850         find ) )
6851      {
6852       ST.mark_name = find;
6853       PUSH_STATE_GOTO( SKIP_next, next, locinput);
6854      }
6855      cur = cur->u.mark.prev_mark;
6856     }
6857    }
6858    /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6859    break;
6860
6861   case SKIP_next_fail:
6862    if (ST.mark_name) {
6863     /* (*CUT:NAME) - Set up to search for the name as we
6864     collapse the stack*/
6865     popmark = ST.mark_name;
6866    } else {
6867     /* (*CUT) - No name, we cut here.*/
6868     if (ST.mark_loc > startpoint)
6869      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6870     /* but we set sv_commit to latest mark_name if there
6871     is one so they can test to see how things lead to this
6872     cut */
6873     if (mark_state)
6874      sv_commit=mark_state->u.mark.mark_name;
6875    }
6876    no_final = 1;
6877    sayNO;
6878    /* NOTREACHED */
6879    assert(0);
6880 #undef ST
6881
6882   case LNBREAK: /* \R */
6883    if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6884     locinput += n;
6885    } else
6886     sayNO;
6887    break;
6888
6889   default:
6890    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6891       PTR2UV(scan), OP(scan));
6892    Perl_croak(aTHX_ "regexp memory corruption");
6893
6894   /* this is a point to jump to in order to increment
6895   * locinput by one character */
6896   increment_locinput:
6897    assert(!NEXTCHR_IS_EOS);
6898    if (utf8_target) {
6899     locinput += PL_utf8skip[nextchr];
6900     /* locinput is allowed to go 1 char off the end, but not 2+ */
6901     if (locinput > reginfo->strend)
6902      sayNO;
6903    }
6904    else
6905     locinput++;
6906    break;
6907
6908   } /* end switch */
6909
6910   /* switch break jumps here */
6911   scan = next; /* prepare to execute the next op and ... */
6912   continue;    /* ... jump back to the top, reusing st */
6913   /* NOTREACHED */
6914   assert(0);
6915
6916  push_yes_state:
6917   /* push a state that backtracks on success */
6918   st->u.yes.prev_yes_state = yes_state;
6919   yes_state = st;
6920   /* FALLTHROUGH */
6921  push_state:
6922   /* push a new regex state, then continue at scan  */
6923   {
6924    regmatch_state *newst;
6925
6926    DEBUG_STACK_r({
6927     regmatch_state *cur = st;
6928     regmatch_state *curyes = yes_state;
6929     int curd = depth;
6930     regmatch_slab *slab = PL_regmatch_slab;
6931     for (;curd > -1;cur--,curd--) {
6932      if (cur < SLAB_FIRST(slab)) {
6933       slab = slab->prev;
6934       cur = SLAB_LAST(slab);
6935      }
6936      PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6937       REPORT_CODE_OFF + 2 + depth * 2,"",
6938       curd, PL_reg_name[cur->resume_state],
6939       (curyes == cur) ? "yes" : ""
6940      );
6941      if (curyes == cur)
6942       curyes = cur->u.yes.prev_yes_state;
6943     }
6944    } else
6945     DEBUG_STATE_pp("push")
6946    );
6947    depth++;
6948    st->locinput = locinput;
6949    newst = st+1;
6950    if (newst >  SLAB_LAST(PL_regmatch_slab))
6951     newst = S_push_slab(aTHX);
6952    PL_regmatch_state = newst;
6953
6954    locinput = pushinput;
6955    st = newst;
6956    continue;
6957    /* NOTREACHED */
6958    assert(0);
6959   }
6960  }
6961
6962  /*
6963  * We get here only if there's trouble -- normally "case END" is
6964  * the terminating point.
6965  */
6966  Perl_croak(aTHX_ "corrupted regexp pointers");
6967  /* NOTREACHED */
6968  sayNO;
6969
6970 yes:
6971  if (yes_state) {
6972   /* we have successfully completed a subexpression, but we must now
6973   * pop to the state marked by yes_state and continue from there */
6974   assert(st != yes_state);
6975 #ifdef DEBUGGING
6976   while (st != yes_state) {
6977    st--;
6978    if (st < SLAB_FIRST(PL_regmatch_slab)) {
6979     PL_regmatch_slab = PL_regmatch_slab->prev;
6980     st = SLAB_LAST(PL_regmatch_slab);
6981    }
6982    DEBUG_STATE_r({
6983     if (no_final) {
6984      DEBUG_STATE_pp("pop (no final)");
6985     } else {
6986      DEBUG_STATE_pp("pop (yes)");
6987     }
6988    });
6989    depth--;
6990   }
6991 #else
6992   while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6993    || yes_state > SLAB_LAST(PL_regmatch_slab))
6994   {
6995    /* not in this slab, pop slab */
6996    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6997    PL_regmatch_slab = PL_regmatch_slab->prev;
6998    st = SLAB_LAST(PL_regmatch_slab);
6999   }
7000   depth -= (st - yes_state);
7001 #endif
7002   st = yes_state;
7003   yes_state = st->u.yes.prev_yes_state;
7004   PL_regmatch_state = st;
7005
7006   if (no_final)
7007    locinput= st->locinput;
7008   state_num = st->resume_state + no_final;
7009   goto reenter_switch;
7010  }
7011
7012  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
7013       PL_colors[4], PL_colors[5]));
7014
7015  if (reginfo->info_aux_eval) {
7016   /* each successfully executed (?{...}) block does the equivalent of
7017   *   local $^R = do {...}
7018   * When popping the save stack, all these locals would be undone;
7019   * bypass this by setting the outermost saved $^R to the latest
7020   * value */
7021   /* I dont know if this is needed or works properly now.
7022   * see code related to PL_replgv elsewhere in this file.
7023   * Yves
7024   */
7025   if (oreplsv != GvSV(PL_replgv))
7026    sv_setsv(oreplsv, GvSV(PL_replgv));
7027  }
7028  result = 1;
7029  goto final_exit;
7030
7031 no:
7032  DEBUG_EXECUTE_r(
7033   PerlIO_printf(Perl_debug_log,
7034    "%*s  %sfailed...%s\n",
7035    REPORT_CODE_OFF+depth*2, "",
7036    PL_colors[4], PL_colors[5])
7037   );
7038
7039 no_silent:
7040  if (no_final) {
7041   if (yes_state) {
7042    goto yes;
7043   } else {
7044    goto final_exit;
7045   }
7046  }
7047  if (depth) {
7048   /* there's a previous state to backtrack to */
7049   st--;
7050   if (st < SLAB_FIRST(PL_regmatch_slab)) {
7051    PL_regmatch_slab = PL_regmatch_slab->prev;
7052    st = SLAB_LAST(PL_regmatch_slab);
7053   }
7054   PL_regmatch_state = st;
7055   locinput= st->locinput;
7056
7057   DEBUG_STATE_pp("pop");
7058   depth--;
7059   if (yes_state == st)
7060    yes_state = st->u.yes.prev_yes_state;
7061
7062   state_num = st->resume_state + 1; /* failure = success + 1 */
7063   goto reenter_switch;
7064  }
7065  result = 0;
7066
7067   final_exit:
7068  if (rex->intflags & PREGf_VERBARG_SEEN) {
7069   SV *sv_err = get_sv("REGERROR", 1);
7070   SV *sv_mrk = get_sv("REGMARK", 1);
7071   if (result) {
7072    sv_commit = &PL_sv_no;
7073    if (!sv_yes_mark)
7074     sv_yes_mark = &PL_sv_yes;
7075   } else {
7076    if (!sv_commit)
7077     sv_commit = &PL_sv_yes;
7078    sv_yes_mark = &PL_sv_no;
7079   }
7080   assert(sv_err);
7081   assert(sv_mrk);
7082   sv_setsv(sv_err, sv_commit);
7083   sv_setsv(sv_mrk, sv_yes_mark);
7084  }
7085
7086
7087  if (last_pushed_cv) {
7088   dSP;
7089   POP_MULTICALL;
7090   PERL_UNUSED_VAR(SP);
7091  }
7092
7093  assert(!result ||  locinput - reginfo->strbeg >= 0);
7094  return result ?  locinput - reginfo->strbeg : -1;
7095 }
7096
7097 /*
7098  - regrepeat - repeatedly match something simple, report how many
7099  *
7100  * What 'simple' means is a node which can be the operand of a quantifier like
7101  * '+', or {1,3}
7102  *
7103  * startposp - pointer a pointer to the start position.  This is updated
7104  *             to point to the byte following the highest successful
7105  *             match.
7106  * p         - the regnode to be repeatedly matched against.
7107  * reginfo   - struct holding match state, such as strend
7108  * max       - maximum number of things to match.
7109  * depth     - (for debugging) backtracking depth.
7110  */
7111 STATIC I32
7112 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
7113    regmatch_info *const reginfo, I32 max, int depth)
7114 {
7115  char *scan;     /* Pointer to current position in target string */
7116  I32 c;
7117  char *loceol = reginfo->strend;   /* local version */
7118  I32 hardcount = 0;  /* How many matches so far */
7119  bool utf8_target = reginfo->is_utf8_target;
7120  int to_complement = 0;  /* Invert the result? */
7121  UV utf8_flags;
7122  _char_class_number classnum;
7123 #ifndef DEBUGGING
7124  PERL_UNUSED_ARG(depth);
7125 #endif
7126
7127  PERL_ARGS_ASSERT_REGREPEAT;
7128
7129  scan = *startposp;
7130  if (max == REG_INFTY)
7131   max = I32_MAX;
7132  else if (! utf8_target && loceol - scan > max)
7133   loceol = scan + max;
7134
7135  /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
7136  * to the maximum of how far we should go in it (leaving it set to the real
7137  * end, if the maximum permissible would take us beyond that).  This allows
7138  * us to make the loop exit condition that we haven't gone past <loceol> to
7139  * also mean that we haven't exceeded the max permissible count, saving a
7140  * test each time through the loop.  But it assumes that the OP matches a
7141  * single byte, which is true for most of the OPs below when applied to a
7142  * non-UTF-8 target.  Those relatively few OPs that don't have this
7143  * characteristic will have to compensate.
7144  *
7145  * There is no adjustment for UTF-8 targets, as the number of bytes per
7146  * character varies.  OPs will have to test both that the count is less
7147  * than the max permissible (using <hardcount> to keep track), and that we
7148  * are still within the bounds of the string (using <loceol>.  A few OPs
7149  * match a single byte no matter what the encoding.  They can omit the max
7150  * test if, for the UTF-8 case, they do the adjustment that was skipped
7151  * above.
7152  *
7153  * Thus, the code above sets things up for the common case; and exceptional
7154  * cases need extra work; the common case is to make sure <scan> doesn't
7155  * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
7156  * count doesn't exceed the maximum permissible */
7157
7158  switch (OP(p)) {
7159  case REG_ANY:
7160   if (utf8_target) {
7161    while (scan < loceol && hardcount < max && *scan != '\n') {
7162     scan += UTF8SKIP(scan);
7163     hardcount++;
7164    }
7165   } else {
7166    while (scan < loceol && *scan != '\n')
7167     scan++;
7168   }
7169   break;
7170  case SANY:
7171   if (utf8_target) {
7172    while (scan < loceol && hardcount < max) {
7173     scan += UTF8SKIP(scan);
7174     hardcount++;
7175    }
7176   }
7177   else
7178    scan = loceol;
7179   break;
7180  case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
7181   if (utf8_target && loceol - scan > max) {
7182
7183    /* <loceol> hadn't been adjusted in the UTF-8 case */
7184    scan +=  max;
7185   }
7186   else {
7187    scan = loceol;
7188   }
7189   break;
7190  case EXACT:
7191   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7192
7193   c = (U8)*STRING(p);
7194
7195   /* Can use a simple loop if the pattern char to match on is invariant
7196   * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
7197   * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
7198   * true iff it doesn't matter if the argument is in UTF-8 or not */
7199   if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
7200    if (utf8_target && loceol - scan > max) {
7201     /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7202     * since here, to match at all, 1 char == 1 byte */
7203     loceol = scan + max;
7204    }
7205    while (scan < loceol && UCHARAT(scan) == c) {
7206     scan++;
7207    }
7208   }
7209   else if (reginfo->is_utf8_pat) {
7210    if (utf8_target) {
7211     STRLEN scan_char_len;
7212
7213     /* When both target and pattern are UTF-8, we have to do
7214     * string EQ */
7215     while (hardcount < max
7216      && scan < loceol
7217      && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
7218      && memEQ(scan, STRING(p), scan_char_len))
7219     {
7220      scan += scan_char_len;
7221      hardcount++;
7222     }
7223    }
7224    else if (! UTF8_IS_ABOVE_LATIN1(c)) {
7225
7226     /* Target isn't utf8; convert the character in the UTF-8
7227     * pattern to non-UTF8, and do a simple loop */
7228     c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
7229     while (scan < loceol && UCHARAT(scan) == c) {
7230      scan++;
7231     }
7232    } /* else pattern char is above Latin1, can't possibly match the
7233     non-UTF-8 target */
7234   }
7235   else {
7236
7237    /* Here, the string must be utf8; pattern isn't, and <c> is
7238    * different in utf8 than not, so can't compare them directly.
7239    * Outside the loop, find the two utf8 bytes that represent c, and
7240    * then look for those in sequence in the utf8 string */
7241    U8 high = UTF8_TWO_BYTE_HI(c);
7242    U8 low = UTF8_TWO_BYTE_LO(c);
7243
7244    while (hardcount < max
7245      && scan + 1 < loceol
7246      && UCHARAT(scan) == high
7247      && UCHARAT(scan + 1) == low)
7248    {
7249     scan += 2;
7250     hardcount++;
7251    }
7252   }
7253   break;
7254
7255  case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
7256   assert(! reginfo->is_utf8_pat);
7257   /* FALLTHROUGH */
7258  case EXACTFA:
7259   utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7260   goto do_exactf;
7261
7262  case EXACTFL:
7263   utf8_flags = FOLDEQ_LOCALE;
7264   goto do_exactf;
7265
7266  case EXACTF:   /* This node only generated for non-utf8 patterns */
7267   assert(! reginfo->is_utf8_pat);
7268   utf8_flags = 0;
7269   goto do_exactf;
7270
7271  case EXACTFU_SS:
7272  case EXACTFU:
7273   utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
7274
7275  do_exactf: {
7276   int c1, c2;
7277   U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
7278
7279   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7280
7281   if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
7282           reginfo))
7283   {
7284    if (c1 == CHRTEST_VOID) {
7285     /* Use full Unicode fold matching */
7286     char *tmpeol = reginfo->strend;
7287     STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
7288     while (hardcount < max
7289       && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
7290            STRING(p), NULL, pat_len,
7291            reginfo->is_utf8_pat, utf8_flags))
7292     {
7293      scan = tmpeol;
7294      tmpeol = reginfo->strend;
7295      hardcount++;
7296     }
7297    }
7298    else if (utf8_target) {
7299     if (c1 == c2) {
7300      while (scan < loceol
7301       && hardcount < max
7302       && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
7303      {
7304       scan += UTF8SKIP(scan);
7305       hardcount++;
7306      }
7307     }
7308     else {
7309      while (scan < loceol
7310       && hardcount < max
7311       && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
7312        || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
7313      {
7314       scan += UTF8SKIP(scan);
7315       hardcount++;
7316      }
7317     }
7318    }
7319    else if (c1 == c2) {
7320     while (scan < loceol && UCHARAT(scan) == c1) {
7321      scan++;
7322     }
7323    }
7324    else {
7325     while (scan < loceol &&
7326      (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
7327     {
7328      scan++;
7329     }
7330    }
7331   }
7332   break;
7333  }
7334  case ANYOF:
7335   if (utf8_target) {
7336    while (hardcount < max
7337     && scan < loceol
7338     && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
7339    {
7340     scan += UTF8SKIP(scan);
7341     hardcount++;
7342    }
7343   } else {
7344    while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
7345     scan++;
7346   }
7347   break;
7348
7349  /* The argument (FLAGS) to all the POSIX node types is the class number */
7350
7351  case NPOSIXL:
7352   to_complement = 1;
7353   /* FALLTHROUGH */
7354
7355  case POSIXL:
7356   if (! utf8_target) {
7357    while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7358                 *scan)))
7359    {
7360     scan++;
7361    }
7362   } else {
7363    while (hardcount < max && scan < loceol
7364     && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7365                 (U8 *) scan)))
7366    {
7367     scan += UTF8SKIP(scan);
7368     hardcount++;
7369    }
7370   }
7371   break;
7372
7373  case POSIXD:
7374   if (utf8_target) {
7375    goto utf8_posix;
7376   }
7377   /* FALLTHROUGH */
7378
7379  case POSIXA:
7380   if (utf8_target && loceol - scan > max) {
7381
7382    /* We didn't adjust <loceol> at the beginning of this routine
7383    * because is UTF-8, but it is actually ok to do so, since here, to
7384    * match, 1 char == 1 byte. */
7385    loceol = scan + max;
7386   }
7387   while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
7388    scan++;
7389   }
7390   break;
7391
7392  case NPOSIXD:
7393   if (utf8_target) {
7394    to_complement = 1;
7395    goto utf8_posix;
7396   }
7397   /* FALLTHROUGH */
7398
7399  case NPOSIXA:
7400   if (! utf8_target) {
7401    while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
7402     scan++;
7403    }
7404   }
7405   else {
7406
7407    /* The complement of something that matches only ASCII matches all
7408    * non-ASCII, plus everything in ASCII that isn't in the class. */
7409    while (hardcount < max && scan < loceol
7410     && (! isASCII_utf8(scan)
7411      || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7412    {
7413     scan += UTF8SKIP(scan);
7414     hardcount++;
7415    }
7416   }
7417   break;
7418
7419  case NPOSIXU:
7420   to_complement = 1;
7421   /* FALLTHROUGH */
7422
7423  case POSIXU:
7424   if (! utf8_target) {
7425    while (scan < loceol && to_complement
7426         ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7427    {
7428     scan++;
7429    }
7430   }
7431   else {
7432  utf8_posix:
7433    classnum = (_char_class_number) FLAGS(p);
7434    if (classnum < _FIRST_NON_SWASH_CC) {
7435
7436     /* Here, a swash is needed for above-Latin1 code points.
7437     * Process as many Latin1 code points using the built-in rules.
7438     * Go to another loop to finish processing upon encountering
7439     * the first Latin1 code point.  We could do that in this loop
7440     * as well, but the other way saves having to test if the swash
7441     * has been loaded every time through the loop: extra space to
7442     * save a test. */
7443     while (hardcount < max && scan < loceol) {
7444      if (UTF8_IS_INVARIANT(*scan)) {
7445       if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7446                 classnum))))
7447       {
7448        break;
7449       }
7450       scan++;
7451      }
7452      else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7453       if (! (to_complement
7454        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7455                  *(scan + 1)),
7456              classnum))))
7457       {
7458        break;
7459       }
7460       scan += 2;
7461      }
7462      else {
7463       goto found_above_latin1;
7464      }
7465
7466      hardcount++;
7467     }
7468    }
7469    else {
7470     /* For these character classes, the knowledge of how to handle
7471     * every code point is compiled in to Perl via a macro.  This
7472     * code is written for making the loops as tight as possible.
7473     * It could be refactored to save space instead */
7474     switch (classnum) {
7475      case _CC_ENUM_SPACE:    /* XXX would require separate code
7476            if we revert the change of \v
7477            matching this */
7478       /* FALLTHROUGH */
7479      case _CC_ENUM_PSXSPC:
7480       while (hardcount < max
7481        && scan < loceol
7482        && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7483       {
7484        scan += UTF8SKIP(scan);
7485        hardcount++;
7486       }
7487       break;
7488      case _CC_ENUM_BLANK:
7489       while (hardcount < max
7490        && scan < loceol
7491        && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7492       {
7493        scan += UTF8SKIP(scan);
7494        hardcount++;
7495       }
7496       break;
7497      case _CC_ENUM_XDIGIT:
7498       while (hardcount < max
7499        && scan < loceol
7500        && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7501       {
7502        scan += UTF8SKIP(scan);
7503        hardcount++;
7504       }
7505       break;
7506      case _CC_ENUM_VERTSPACE:
7507       while (hardcount < max
7508        && scan < loceol
7509        && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7510       {
7511        scan += UTF8SKIP(scan);
7512        hardcount++;
7513       }
7514       break;
7515      case _CC_ENUM_CNTRL:
7516       while (hardcount < max
7517        && scan < loceol
7518        && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7519       {
7520        scan += UTF8SKIP(scan);
7521        hardcount++;
7522       }
7523       break;
7524      default:
7525       Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7526     }
7527    }
7528   }
7529   break;
7530
7531  found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7532
7533   /* Load the swash if not already present */
7534   if (! PL_utf8_swash_ptrs[classnum]) {
7535    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7536    PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7537           "utf8",
7538           "",
7539           &PL_sv_undef, 1, 0,
7540           PL_XPosix_ptrs[classnum], &flags);
7541   }
7542
7543   while (hardcount < max && scan < loceol
7544    && to_complement ^ cBOOL(_generic_utf8(
7545          classnum,
7546          scan,
7547          swash_fetch(PL_utf8_swash_ptrs[classnum],
7548             (U8 *) scan,
7549             TRUE))))
7550   {
7551    scan += UTF8SKIP(scan);
7552    hardcount++;
7553   }
7554   break;
7555
7556  case LNBREAK:
7557   if (utf8_target) {
7558    while (hardcount < max && scan < loceol &&
7559      (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7560     scan += c;
7561     hardcount++;
7562    }
7563   } else {
7564    /* LNBREAK can match one or two latin chars, which is ok, but we
7565    * have to use hardcount in this situation, and throw away the
7566    * adjustment to <loceol> done before the switch statement */
7567    loceol = reginfo->strend;
7568    while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7569     scan+=c;
7570     hardcount++;
7571    }
7572   }
7573   break;
7574
7575  case BOUND:
7576  case BOUNDA:
7577  case BOUNDL:
7578  case BOUNDU:
7579  case EOS:
7580  case GPOS:
7581  case KEEPS:
7582  case NBOUND:
7583  case NBOUNDA:
7584  case NBOUNDL:
7585  case NBOUNDU:
7586  case OPFAIL:
7587  case SBOL:
7588  case SEOL:
7589   /* These are all 0 width, so match right here or not at all. */
7590   break;
7591
7592  default:
7593   Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7594   /* NOTREACHED */
7595   assert(0);
7596
7597  }
7598
7599  if (hardcount)
7600   c = hardcount;
7601  else
7602   c = scan - *startposp;
7603  *startposp = scan;
7604
7605  DEBUG_r({
7606   GET_RE_DEBUG_FLAGS_DECL;
7607   DEBUG_EXECUTE_r({
7608    SV * const prop = sv_newmortal();
7609    regprop(prog, prop, p, reginfo);
7610    PerlIO_printf(Perl_debug_log,
7611       "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7612       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7613   });
7614  });
7615
7616  return(c);
7617 }
7618
7619
7620 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7621 /*
7622 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7623 create a copy so that changes the caller makes won't change the shared one.
7624 If <altsvp> is non-null, will return NULL in it, for back-compat.
7625  */
7626 SV *
7627 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7628 {
7629  PERL_ARGS_ASSERT_REGCLASS_SWASH;
7630
7631  if (altsvp) {
7632   *altsvp = NULL;
7633  }
7634
7635  return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL));
7636 }
7637
7638 SV *
7639 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
7640           const regnode* node,
7641           bool doinit,
7642           SV** listsvp,
7643           SV** only_utf8_locale_ptr)
7644 {
7645  /* For internal core use only.
7646  * Returns the swash for the input 'node' in the regex 'prog'.
7647  * If <doinit> is 'true', will attempt to create the swash if not already
7648  *   done.
7649  * If <listsvp> is non-null, will return the printable contents of the
7650  *    swash.  This can be used to get debugging information even before the
7651  *    swash exists, by calling this function with 'doinit' set to false, in
7652  *    which case the components that will be used to eventually create the
7653  *    swash are returned  (in a printable form).
7654  * Tied intimately to how regcomp.c sets up the data structure */
7655
7656  SV *sw  = NULL;
7657  SV *si  = NULL;         /* Input swash initialization string */
7658  SV*  invlist = NULL;
7659
7660  RXi_GET_DECL(prog,progi);
7661  const struct reg_data * const data = prog ? progi->data : NULL;
7662
7663  PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
7664
7665  assert(ANYOF_FLAGS(node)
7666       & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
7667
7668  if (data && data->count) {
7669   const U32 n = ARG(node);
7670
7671   if (data->what[n] == 's') {
7672    SV * const rv = MUTABLE_SV(data->data[n]);
7673    AV * const av = MUTABLE_AV(SvRV(rv));
7674    SV **const ary = AvARRAY(av);
7675    U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7676
7677    si = *ary; /* ary[0] = the string to initialize the swash with */
7678
7679    /* Elements 3 and 4 are either both present or both absent. [3] is
7680    * any inversion list generated at compile time; [4] indicates if
7681    * that inversion list has any user-defined properties in it. */
7682    if (av_tindex(av) >= 2) {
7683     if (only_utf8_locale_ptr
7684      && ary[2]
7685      && ary[2] != &PL_sv_undef)
7686     {
7687      *only_utf8_locale_ptr = ary[2];
7688     }
7689     else {
7690      assert(only_utf8_locale_ptr);
7691      *only_utf8_locale_ptr = NULL;
7692     }
7693
7694     if (av_tindex(av) >= 3) {
7695      invlist = ary[3];
7696      if (SvUV(ary[4])) {
7697       swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7698      }
7699     }
7700     else {
7701      invlist = NULL;
7702     }
7703    }
7704
7705    /* Element [1] is reserved for the set-up swash.  If already there,
7706    * return it; if not, create it and store it there */
7707    if (ary[1] && SvROK(ary[1])) {
7708     sw = ary[1];
7709    }
7710    else if (doinit && ((si && si != &PL_sv_undef)
7711         || (invlist && invlist != &PL_sv_undef))) {
7712     assert(si);
7713     sw = _core_swash_init("utf8", /* the utf8 package */
7714          "", /* nameless */
7715          si,
7716          1, /* binary */
7717          0, /* not from tr/// */
7718          invlist,
7719          &swash_init_flags);
7720     (void)av_store(av, 1, sw);
7721    }
7722   }
7723  }
7724
7725  /* If requested, return a printable version of what this swash matches */
7726  if (listsvp) {
7727   SV* matches_string = newSVpvs("");
7728
7729   /* The swash should be used, if possible, to get the data, as it
7730   * contains the resolved data.  But this function can be called at
7731   * compile-time, before everything gets resolved, in which case we
7732   * return the currently best available information, which is the string
7733   * that will eventually be used to do that resolving, 'si' */
7734   if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7735    && (si && si != &PL_sv_undef))
7736   {
7737    sv_catsv(matches_string, si);
7738   }
7739
7740   /* Add the inversion list to whatever we have.  This may have come from
7741   * the swash, or from an input parameter */
7742   if (invlist) {
7743    sv_catsv(matches_string, _invlist_contents(invlist));
7744   }
7745   *listsvp = matches_string;
7746  }
7747
7748  return sw;
7749 }
7750 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
7751
7752 /*
7753  - reginclass - determine if a character falls into a character class
7754
7755   n is the ANYOF regnode
7756   p is the target string
7757   p_end points to one byte beyond the end of the target string
7758   utf8_target tells whether p is in UTF-8.
7759
7760   Returns true if matched; false otherwise.
7761
7762   Note that this can be a synthetic start class, a combination of various
7763   nodes, so things you think might be mutually exclusive, such as locale,
7764   aren't.  It can match both locale and non-locale
7765
7766  */
7767
7768 STATIC bool
7769 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
7770 {
7771  dVAR;
7772  const char flags = ANYOF_FLAGS(n);
7773  bool match = FALSE;
7774  UV c = *p;
7775
7776  PERL_ARGS_ASSERT_REGINCLASS;
7777
7778  /* If c is not already the code point, get it.  Note that
7779  * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7780  if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7781   STRLEN c_len = 0;
7782   c = utf8n_to_uvchr(p, p_end - p, &c_len,
7783     (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7784     | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7785     /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7786     * UTF8_ALLOW_FFFF */
7787   if (c_len == (STRLEN)-1)
7788    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7789  }
7790
7791  /* If this character is potentially in the bitmap, check it */
7792  if (c < 256) {
7793   if (ANYOF_BITMAP_TEST(n, c))
7794    match = TRUE;
7795   else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL
7796     && ! utf8_target
7797     && ! isASCII(c))
7798   {
7799    match = TRUE;
7800   }
7801   else if (flags & ANYOF_LOCALE_FLAGS) {
7802    if (flags & ANYOF_LOC_FOLD) {
7803     if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
7804      match = TRUE;
7805     }
7806    }
7807    if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) {
7808
7809     /* The data structure is arranged so bits 0, 2, 4, ... are set
7810     * if the class includes the Posix character class given by
7811     * bit/2; and 1, 3, 5, ... are set if the class includes the
7812     * complemented Posix class given by int(bit/2).  So we loop
7813     * through the bits, each time changing whether we complement
7814     * the result or not.  Suppose for the sake of illustration
7815     * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7816     * is set, it means there is a match for this ANYOF node if the
7817     * character is in the class given by the expression (0 / 2 = 0
7818     * = \w).  If it is in that class, isFOO_lc() will return 1,
7819     * and since 'to_complement' is 0, the result will stay TRUE,
7820     * and we exit the loop.  Suppose instead that bit 0 is 0, but
7821     * bit 1 is 1.  That means there is a match if the character
7822     * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7823     * but will on bit 1.  On the second iteration 'to_complement'
7824     * will be 1, so the exclusive or will reverse things, so we
7825     * are testing for \W.  On the third iteration, 'to_complement'
7826     * will be 0, and we would be testing for \s; the fourth
7827     * iteration would test for \S, etc.
7828     *
7829     * Note that this code assumes that all the classes are closed
7830     * under folding.  For example, if a character matches \w, then
7831     * its fold does too; and vice versa.  This should be true for
7832     * any well-behaved locale for all the currently defined Posix
7833     * classes, except for :lower: and :upper:, which are handled
7834     * by the pseudo-class :cased: which matches if either of the
7835     * other two does.  To get rid of this assumption, an outer
7836     * loop could be used below to iterate over both the source
7837     * character, and its fold (if different) */
7838
7839     int count = 0;
7840     int to_complement = 0;
7841
7842     while (count < ANYOF_MAX) {
7843      if (ANYOF_POSIXL_TEST(n, count)
7844       && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7845      {
7846       match = TRUE;
7847       break;
7848      }
7849      count++;
7850      to_complement ^= 1;
7851     }
7852    }
7853   }
7854  }
7855
7856
7857  /* If the bitmap didn't (or couldn't) match, and something outside the
7858  * bitmap could match, try that. */
7859  if (!match) {
7860   if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) {
7861    match = TRUE; /* Everything above 255 matches */
7862   }
7863   else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
7864     || (utf8_target && (flags & ANYOF_UTF8))
7865     || ((flags & ANYOF_LOC_FOLD)
7866      && IN_UTF8_CTYPE_LOCALE
7867      && ARG(n) != ANYOF_NONBITMAP_EMPTY))
7868   {
7869    SV* only_utf8_locale = NULL;
7870    SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
7871                &only_utf8_locale);
7872    if (sw) {
7873     U8 utf8_buffer[2];
7874     U8 * utf8_p;
7875     if (utf8_target) {
7876      utf8_p = (U8 *) p;
7877     } else { /* Convert to utf8 */
7878      utf8_p = utf8_buffer;
7879      append_utf8_from_native_byte(*p, &utf8_p);
7880      utf8_p = utf8_buffer;
7881     }
7882
7883     if (swash_fetch(sw, utf8_p, TRUE)) {
7884      match = TRUE;
7885     }
7886    }
7887    if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
7888     match = _invlist_contains_cp(only_utf8_locale, c);
7889    }
7890   }
7891
7892   if (UNICODE_IS_SUPER(c)
7893    && (flags & ANYOF_WARN_SUPER)
7894    && ckWARN_d(WARN_NON_UNICODE))
7895   {
7896    Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7897     "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
7898   }
7899  }
7900
7901 #if ANYOF_INVERT != 1
7902  /* Depending on compiler optimization cBOOL takes time, so if don't have to
7903  * use it, don't */
7904 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
7905 #endif
7906
7907  /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7908  return (flags & ANYOF_INVERT) ^ match;
7909 }
7910
7911 STATIC U8 *
7912 S_reghop3(U8 *s, SSize_t off, const U8* lim)
7913 {
7914  /* return the position 'off' UTF-8 characters away from 's', forward if
7915  * 'off' >= 0, backwards if negative.  But don't go outside of position
7916  * 'lim', which better be < s  if off < 0 */
7917
7918  PERL_ARGS_ASSERT_REGHOP3;
7919
7920  if (off >= 0) {
7921   while (off-- && s < lim) {
7922    /* XXX could check well-formedness here */
7923    s += UTF8SKIP(s);
7924   }
7925  }
7926  else {
7927   while (off++ && s > lim) {
7928    s--;
7929    if (UTF8_IS_CONTINUED(*s)) {
7930     while (s > lim && UTF8_IS_CONTINUATION(*s))
7931      s--;
7932    }
7933    /* XXX could check well-formedness here */
7934   }
7935  }
7936  return s;
7937 }
7938
7939 STATIC U8 *
7940 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
7941 {
7942  PERL_ARGS_ASSERT_REGHOP4;
7943
7944  if (off >= 0) {
7945   while (off-- && s < rlim) {
7946    /* XXX could check well-formedness here */
7947    s += UTF8SKIP(s);
7948   }
7949  }
7950  else {
7951   while (off++ && s > llim) {
7952    s--;
7953    if (UTF8_IS_CONTINUED(*s)) {
7954     while (s > llim && UTF8_IS_CONTINUATION(*s))
7955      s--;
7956    }
7957    /* XXX could check well-formedness here */
7958   }
7959  }
7960  return s;
7961 }
7962
7963 /* like reghop3, but returns NULL on overrun, rather than returning last
7964  * char pos */
7965
7966 STATIC U8 *
7967 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
7968 {
7969  PERL_ARGS_ASSERT_REGHOPMAYBE3;
7970
7971  if (off >= 0) {
7972   while (off-- && s < lim) {
7973    /* XXX could check well-formedness here */
7974    s += UTF8SKIP(s);
7975   }
7976   if (off >= 0)
7977    return NULL;
7978  }
7979  else {
7980   while (off++ && s > lim) {
7981    s--;
7982    if (UTF8_IS_CONTINUED(*s)) {
7983     while (s > lim && UTF8_IS_CONTINUATION(*s))
7984      s--;
7985    }
7986    /* XXX could check well-formedness here */
7987   }
7988   if (off <= 0)
7989    return NULL;
7990  }
7991  return s;
7992 }
7993
7994
7995 /* when executing a regex that may have (?{}), extra stuff needs setting
7996    up that will be visible to the called code, even before the current
7997    match has finished. In particular:
7998
7999    * $_ is localised to the SV currently being matched;
8000    * pos($_) is created if necessary, ready to be updated on each call-out
8001  to code;
8002    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
8003  isn't set until the current pattern is successfully finished), so that
8004  $1 etc of the match-so-far can be seen;
8005    * save the old values of subbeg etc of the current regex, and  set then
8006  to the current string (again, this is normally only done at the end
8007  of execution)
8008 */
8009
8010 static void
8011 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
8012 {
8013  MAGIC *mg;
8014  regexp *const rex = ReANY(reginfo->prog);
8015  regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8016
8017  eval_state->rex = rex;
8018
8019  if (reginfo->sv) {
8020   /* Make $_ available to executed code. */
8021   if (reginfo->sv != DEFSV) {
8022    SAVE_DEFSV;
8023    DEFSV_set(reginfo->sv);
8024   }
8025
8026   if (!(mg = mg_find_mglob(reginfo->sv))) {
8027    /* prepare for quick setting of pos */
8028    mg = sv_magicext_mglob(reginfo->sv);
8029    mg->mg_len = -1;
8030   }
8031   eval_state->pos_magic = mg;
8032   eval_state->pos       = mg->mg_len;
8033   eval_state->pos_flags = mg->mg_flags;
8034  }
8035  else
8036   eval_state->pos_magic = NULL;
8037
8038  if (!PL_reg_curpm) {
8039   /* PL_reg_curpm is a fake PMOP that we can attach the current
8040   * regex to and point PL_curpm at, so that $1 et al are visible
8041   * within a /(?{})/. It's just allocated once per interpreter the
8042   * first time its needed */
8043   Newxz(PL_reg_curpm, 1, PMOP);
8044 #ifdef USE_ITHREADS
8045   {
8046    SV* const repointer = &PL_sv_undef;
8047    /* this regexp is also owned by the new PL_reg_curpm, which
8048    will try to free it.  */
8049    av_push(PL_regex_padav, repointer);
8050    PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
8051    PL_regex_pad = AvARRAY(PL_regex_padav);
8052   }
8053 #endif
8054  }
8055  SET_reg_curpm(reginfo->prog);
8056  eval_state->curpm = PL_curpm;
8057  PL_curpm = PL_reg_curpm;
8058  if (RXp_MATCH_COPIED(rex)) {
8059   /*  Here is a serious problem: we cannot rewrite subbeg,
8060    since it may be needed if this match fails.  Thus
8061    $` inside (?{}) could fail... */
8062   eval_state->subbeg     = rex->subbeg;
8063   eval_state->sublen     = rex->sublen;
8064   eval_state->suboffset  = rex->suboffset;
8065   eval_state->subcoffset = rex->subcoffset;
8066 #ifdef PERL_ANY_COW
8067   eval_state->saved_copy = rex->saved_copy;
8068 #endif
8069   RXp_MATCH_COPIED_off(rex);
8070  }
8071  else
8072   eval_state->subbeg = NULL;
8073  rex->subbeg = (char *)reginfo->strbeg;
8074  rex->suboffset = 0;
8075  rex->subcoffset = 0;
8076  rex->sublen = reginfo->strend - reginfo->strbeg;
8077 }
8078
8079
8080 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
8081
8082 static void
8083 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
8084 {
8085  regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8086  regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
8087  regmatch_slab *s;
8088
8089  Safefree(aux->poscache);
8090
8091  if (eval_state) {
8092
8093   /* undo the effects of S_setup_eval_state() */
8094
8095   if (eval_state->subbeg) {
8096    regexp * const rex = eval_state->rex;
8097    rex->subbeg     = eval_state->subbeg;
8098    rex->sublen     = eval_state->sublen;
8099    rex->suboffset  = eval_state->suboffset;
8100    rex->subcoffset = eval_state->subcoffset;
8101 #ifdef PERL_ANY_COW
8102    rex->saved_copy = eval_state->saved_copy;
8103 #endif
8104    RXp_MATCH_COPIED_on(rex);
8105   }
8106   if (eval_state->pos_magic)
8107   {
8108    eval_state->pos_magic->mg_len = eval_state->pos;
8109    eval_state->pos_magic->mg_flags =
8110     (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8111    | (eval_state->pos_flags & MGf_BYTES);
8112   }
8113
8114   PL_curpm = eval_state->curpm;
8115  }
8116
8117  PL_regmatch_state = aux->old_regmatch_state;
8118  PL_regmatch_slab  = aux->old_regmatch_slab;
8119
8120  /* free all slabs above current one - this must be the last action
8121  * of this function, as aux and eval_state are allocated within
8122  * slabs and may be freed here */
8123
8124  s = PL_regmatch_slab->next;
8125  if (s) {
8126   PL_regmatch_slab->next = NULL;
8127   while (s) {
8128    regmatch_slab * const osl = s;
8129    s = s->next;
8130    Safefree(osl);
8131   }
8132  }
8133 }
8134
8135
8136 STATIC void
8137 S_to_utf8_substr(pTHX_ regexp *prog)
8138 {
8139  /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8140  * on the converted value */
8141
8142  int i = 1;
8143
8144  PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8145
8146  do {
8147   if (prog->substrs->data[i].substr
8148    && !prog->substrs->data[i].utf8_substr) {
8149    SV* const sv = newSVsv(prog->substrs->data[i].substr);
8150    prog->substrs->data[i].utf8_substr = sv;
8151    sv_utf8_upgrade(sv);
8152    if (SvVALID(prog->substrs->data[i].substr)) {
8153     if (SvTAIL(prog->substrs->data[i].substr)) {
8154      /* Trim the trailing \n that fbm_compile added last
8155      time.  */
8156      SvCUR_set(sv, SvCUR(sv) - 1);
8157      /* Whilst this makes the SV technically "invalid" (as its
8158      buffer is no longer followed by "\0") when fbm_compile()
8159      adds the "\n" back, a "\0" is restored.  */
8160      fbm_compile(sv, FBMcf_TAIL);
8161     } else
8162      fbm_compile(sv, 0);
8163    }
8164    if (prog->substrs->data[i].substr == prog->check_substr)
8165     prog->check_utf8 = sv;
8166   }
8167  } while (i--);
8168 }
8169
8170 STATIC bool
8171 S_to_byte_substr(pTHX_ regexp *prog)
8172 {
8173  /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8174  * on the converted value; returns FALSE if can't be converted. */
8175
8176  int i = 1;
8177
8178  PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8179
8180  do {
8181   if (prog->substrs->data[i].utf8_substr
8182    && !prog->substrs->data[i].substr) {
8183    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
8184    if (! sv_utf8_downgrade(sv, TRUE)) {
8185     return FALSE;
8186    }
8187    if (SvVALID(prog->substrs->data[i].utf8_substr)) {
8188     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
8189      /* Trim the trailing \n that fbm_compile added last
8190       time.  */
8191      SvCUR_set(sv, SvCUR(sv) - 1);
8192      fbm_compile(sv, FBMcf_TAIL);
8193     } else
8194      fbm_compile(sv, 0);
8195    }
8196    prog->substrs->data[i].substr = sv;
8197    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
8198     prog->check_substr = sv;
8199   }
8200  } while (i--);
8201
8202  return TRUE;
8203 }
8204
8205 /*
8206  * Local variables:
8207  * c-indentation-style: bsd
8208  * c-basic-offset: 4
8209  * indent-tabs-mode: nil
8210  * End:
8211  *
8212  * ex: set ts=8 sts=4 sw=4 et:
8213  */