src/5020001/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *  One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  * Copyright (c) 1986 by University of Toronto.
  44  * Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  * Permission is granted to anyone to use this software for any
  47  * purpose on any computer system, and to redistribute it freely,
  48  * subject to the following restrictions:
  49  *
  50  * 1. The author is not responsible for the consequences of use of
  51  *  this software, no matter how awful, even if they arise
  52  *  from defects in it.
  53  *
  54  * 2. The origin of this software must not be misrepresented, either
  55  *  by explicit claim or by omission.
  56  *
  57  * 3. Altered versions must be plainly marked as such, and must not
  58  *  be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #undef PERL_IN_XSUB_RE
  76 #define PERL_IN_XSUB_RE 1
  77 #include "perl.h"
  78 #include "re_defs.h"
  79 #undef PERL_IN_XSUB_RE
  80
  81 #ifdef PERL_IN_XSUB_RE
  82 #  include "re_comp.h"
  83 #else
  84 #  include "regcomp.h"
  85 #endif
  86
  87 #include "inline_invlist.c"
  88 #include "unicode_constants.h"
  89
  90 #ifdef DEBUGGING
  91 /* At least one required character in the target string is expressible only in
  92  * UTF-8. */
  93 static const char* const non_utf8_target_but_utf8_required
  94     = "Can't match, because target string needs to be in UTF-8\n";
  95 #endif
  96
  97 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
  98  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
  99  goto target; \
 100 } STMT_END
 101
 102 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 103
 104 #ifndef STATIC
 105 #define STATIC static
 106 #endif
 107
 108 /* Valid only for non-utf8 strings: avoids the reginclass
 109  * call if there are no complications: i.e., if everything matchable is
 110  * straight forward in the bitmap */
 111 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0)   \
 112            : ANYOF_BITMAP_TEST(p,*(c)))
 113
 114 /*
 115  * Forwards.
 116  */
 117
 118 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 119 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 120
 121 #define HOPc(pos,off) \
 122   (char *)(reginfo->is_utf8_target \
 123    ? reghop3((U8*)pos, off, \
 124      (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 125    : (U8*)(pos + off))
 126
 127 #define HOPBACKc(pos, off) \
 128   (char*)(reginfo->is_utf8_target \
 129    ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 130    : (pos - off >= reginfo->strbeg) \
 131     ? (U8*)pos - off  \
 132     : NULL)
 133
 134 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 135 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 136
 137 /* lim must be +ve. Returns NULL on overshoot */
 138 #define HOPMAYBE3(pos,off,lim) \
 139   (reginfo->is_utf8_target                        \
 140    ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 141    : ((U8*)pos + off <= lim)                   \
 142     ? (U8*)pos + off                        \
 143     : NULL)
 144
 145 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 146  * off must be >=0; args should be vars rather than expressions */
 147 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 148  ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 149  : (U8*)((pos + off) > lim ? lim : (pos + off)))
 150
 151 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 152  ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 153  : (U8*)(pos + off))
 154 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 155
 156 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 157 #define NEXTCHR_IS_EOS (nextchr < 0)
 158
 159 #define SET_nextchr \
 160  nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 161
 162 #define SET_locinput(p) \
 163  locinput = (p);  \
 164  SET_nextchr
 165
 166
 167 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 168   if (!swash_ptr) {                                                     \
 169    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 170    swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 171           1, 0, invlist, &flags);              \
 172    assert(swash_ptr);                                                \
 173   }                                                                     \
 174  } STMT_END
 175
 176 /* If in debug mode, we test that a known character properly matches */
 177 #ifdef DEBUGGING
 178 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 179           property_name,                      \
 180           invlist,                            \
 181           utf8_char_in_property)              \
 182   LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 183   assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 184 #else
 185 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 186           property_name,                      \
 187           invlist,                            \
 188           utf8_char_in_property)              \
 189   LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 190 #endif
 191
 192 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 193           PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 194           "",                                   \
 195           PL_XPosix_ptrs[_CC_WORDCHAR],         \
 196           LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
 197
 198 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */          \
 199  STMT_START {                                                              \
 200   LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin,               \
 201          "_X_regular_begin",                    \
 202          NULL,                                  \
 203          LATIN_CAPITAL_LETTER_SHARP_S_UTF8);    \
 204   LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend,                      \
 205          "_X_extend",                           \
 206          NULL,                                  \
 207          COMBINING_GRAVE_ACCENT_UTF8);          \
 208  } STMT_END
 209
 210 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
 211 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 212
 213 /* for use after a quantifier and before an EXACT-like node -- japhy */
 214 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 215  *
 216  * NOTE that *nothing* that affects backtracking should be in here, specifically
 217  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 218  * node that is in between two EXACT like nodes when ascertaining what the required
 219  * "follow" character is. This should probably be moved to regex compile time
 220  * although it may be done at run time beause of the REF possibility - more
 221  * investigation required. -- demerphq
 222 */
 223 #define JUMPABLE(rn) (                                                             \
 224  OP(rn) == OPEN ||                                                              \
 225  (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 226  OP(rn) == EVAL ||                                                              \
 227  OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 228  OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 229  OP(rn) == KEEPS ||                                                             \
 230  (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 231 )
 232 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 233
 234 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 235
 236 #if 0
 237 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 238    we don't need this definition. */
 239 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 240 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 241 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 242
 243 #else
 244 /* ... so we use this as its faster. */
 245 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 246 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 247 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 248 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 249
 250 #endif
 251
 252 /*
 253   Search for mandatory following text node; for lookahead, the text must
 254   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 255 */
 256 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 257  while (JUMPABLE(rn)) { \
 258   const OPCODE type = OP(rn); \
 259   if (type == SUSPEND || PL_regkind[type] == CURLY) \
 260    rn = NEXTOPER(NEXTOPER(rn)); \
 261   else if (type == PLUS) \
 262    rn = NEXTOPER(rn); \
 263   else if (type == IFMATCH) \
 264    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 265   else rn += NEXT_OFF(rn); \
 266  } \
 267 } STMT_END
 268
 269 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
 270  * These are for the pre-composed Hangul syllables, which are all in a
 271  * contiguous block and arranged there in such a way so as to facilitate
 272  * alorithmic determination of their characteristics.  As such, they don't need
 273  * a swash, but can be determined by simple arithmetic.  Almost all are
 274  * GCB=LVT, but every 28th one is a GCB=LV */
 275 #define SBASE 0xAC00    /* Start of block */
 276 #define SCount 11172    /* Length of block */
 277 #define TCount 28
 278
 279 #define SLAB_FIRST(s) (&(s)->states[0])
 280 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 281
 282 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 283 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 284 static regmatch_state * S_push_slab(pTHX);
 285
 286 #define REGCP_PAREN_ELEMS 3
 287 #define REGCP_OTHER_ELEMS 3
 288 #define REGCP_FRAME_ELEMS 1
 289 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 290  * are needed for the regexp context stack bookkeeping. */
 291
 292 STATIC CHECKPOINT
 293 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 294 {
 295  dVAR;
 296  const int retval = PL_savestack_ix;
 297  const int paren_elems_to_push =
 298     (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 299  const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 300  const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 301  I32 p;
 302  GET_RE_DEBUG_FLAGS_DECL;
 303
 304  PERL_ARGS_ASSERT_REGCPPUSH;
 305
 306  if (paren_elems_to_push < 0)
 307   Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i",
 308     paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS);
 309
 310  if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 311   Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 312     " out of range (%lu-%ld)",
 313     total_elems,
 314     (unsigned long)maxopenparen,
 315     (long)parenfloor);
 316
 317  SSGROW(total_elems + REGCP_FRAME_ELEMS);
 318
 319  DEBUG_BUFFERS_r(
 320   if ((int)maxopenparen > (int)parenfloor)
 321    PerlIO_printf(Perl_debug_log,
 322     "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 323     PTR2UV(rex),
 324     PTR2UV(rex->offs)
 325    );
 326  );
 327  for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 328 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 329   SSPUSHIV(rex->offs[p].end);
 330   SSPUSHIV(rex->offs[p].start);
 331   SSPUSHINT(rex->offs[p].start_tmp);
 332   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 333    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 334    (UV)p,
 335    (IV)rex->offs[p].start,
 336    (IV)rex->offs[p].start_tmp,
 337    (IV)rex->offs[p].end
 338   ));
 339  }
 340 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 341  SSPUSHINT(maxopenparen);
 342  SSPUSHINT(rex->lastparen);
 343  SSPUSHINT(rex->lastcloseparen);
 344  SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 345
 346  return retval;
 347 }
 348
 349 /* These are needed since we do not localize EVAL nodes: */
 350 #define REGCP_SET(cp)                                           \
 351  DEBUG_STATE_r(                                              \
 352    PerlIO_printf(Perl_debug_log,          \
 353     "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 354     (IV)PL_savestack_ix));                          \
 355  cp = PL_savestack_ix
 356
 357 #define REGCP_UNWIND(cp)                                        \
 358  DEBUG_STATE_r(                                              \
 359   if (cp != PL_savestack_ix)                   \
 360     PerlIO_printf(Perl_debug_log,          \
 361     "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 362     (IV)(cp), (IV)PL_savestack_ix));                \
 363  regcpblow(cp)
 364
 365 #define UNWIND_PAREN(lp, lcp)               \
 366  for (n = rex->lastparen; n > lp; n--)   \
 367   rex->offs[n].end = -1;              \
 368  rex->lastparen = n;                     \
 369  rex->lastcloseparen = lcp;
 370
 371
 372 STATIC void
 373 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 374 {
 375  dVAR;
 376  UV i;
 377  U32 paren;
 378  GET_RE_DEBUG_FLAGS_DECL;
 379
 380  PERL_ARGS_ASSERT_REGCPPOP;
 381
 382  /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 383  i = SSPOPUV;
 384  assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 385  i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 386  rex->lastcloseparen = SSPOPINT;
 387  rex->lastparen = SSPOPINT;
 388  *maxopenparen_p = SSPOPINT;
 389
 390  i -= REGCP_OTHER_ELEMS;
 391  /* Now restore the parentheses context. */
 392  DEBUG_BUFFERS_r(
 393   if (i || rex->lastparen + 1 <= rex->nparens)
 394    PerlIO_printf(Perl_debug_log,
 395     "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 396     PTR2UV(rex),
 397     PTR2UV(rex->offs)
 398    );
 399  );
 400  paren = *maxopenparen_p;
 401  for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 402   SSize_t tmps;
 403   rex->offs[paren].start_tmp = SSPOPINT;
 404   rex->offs[paren].start = SSPOPIV;
 405   tmps = SSPOPIV;
 406   if (paren <= rex->lastparen)
 407    rex->offs[paren].end = tmps;
 408   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 409    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 410    (UV)paren,
 411    (IV)rex->offs[paren].start,
 412    (IV)rex->offs[paren].start_tmp,
 413    (IV)rex->offs[paren].end,
 414    (paren > rex->lastparen ? "(skipped)" : ""));
 415   );
 416   paren--;
 417  }
 418 #if 1
 419  /* It would seem that the similar code in regtry()
 420  * already takes care of this, and in fact it is in
 421  * a better location to since this code can #if 0-ed out
 422  * but the code in regtry() is needed or otherwise tests
 423  * requiring null fields (pat.t#187 and split.t#{13,14}
 424  * (as of patchlevel 7877)  will fail.  Then again,
 425  * this code seems to be necessary or otherwise
 426  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 427  * --jhi updated by dapm */
 428  for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 429   if (i > *maxopenparen_p)
 430    rex->offs[i].start = -1;
 431   rex->offs[i].end = -1;
 432   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 433    "    \\%"UVuf": %s   ..-1 undeffing\n",
 434    (UV)i,
 435    (i > *maxopenparen_p) ? "-1" : "  "
 436   ));
 437  }
 438 #endif
 439 }
 440
 441 /* restore the parens and associated vars at savestack position ix,
 442  * but without popping the stack */
 443
 444 STATIC void
 445 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 446 {
 447  I32 tmpix = PL_savestack_ix;
 448  PL_savestack_ix = ix;
 449  regcppop(rex, maxopenparen_p);
 450  PL_savestack_ix = tmpix;
 451 }
 452
 453 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
 454
 455 STATIC bool
 456 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 457 {
 458  /* Returns a boolean as to whether or not 'character' is a member of the
 459  * Posix character class given by 'classnum' that should be equivalent to a
 460  * value in the typedef '_char_class_number'.
 461  *
 462  * Ideally this could be replaced by a just an array of function pointers
 463  * to the C library functions that implement the macros this calls.
 464  * However, to compile, the precise function signatures are required, and
 465  * these may vary from platform to to platform.  To avoid having to figure
 466  * out what those all are on each platform, I (khw) am using this method,
 467  * which adds an extra layer of function call overhead (unless the C
 468  * optimizer strips it away).  But we don't particularly care about
 469  * performance with locales anyway. */
 470
 471  switch ((_char_class_number) classnum) {
 472   case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 473   case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 474   case _CC_ENUM_ASCII:     return isASCII_LC(character);
 475   case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 476   case _CC_ENUM_CASED:     return isLOWER_LC(character)
 477           || isUPPER_LC(character);
 478   case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 479   case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 480   case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 481   case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 482   case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 483   case _CC_ENUM_PSXSPC:    return isPSXSPC_LC(character);
 484   case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 485   case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 486   case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 487   case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 488   case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 489   default:    /* VERTSPACE should never occur in locales */
 490    Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 491  }
 492
 493  assert(0); /* NOTREACHED */
 494  return FALSE;
 495 }
 496
 497 STATIC bool
 498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 499 {
 500  /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 501  * 'character' is a member of the Posix character class given by 'classnum'
 502  * that should be equivalent to a value in the typedef
 503  * '_char_class_number'.
 504  *
 505  * This just calls isFOO_lc on the code point for the character if it is in
 506  * the range 0-255.  Outside that range, all characters avoid Unicode
 507  * rules, ignoring any locale.  So use the Unicode function if this class
 508  * requires a swash, and use the Unicode macro otherwise. */
 509
 510  PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 511
 512  if (UTF8_IS_INVARIANT(*character)) {
 513   return isFOO_lc(classnum, *character);
 514  }
 515  else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 516   return isFOO_lc(classnum,
 517       TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
 518  }
 519
 520  if (classnum < _FIRST_NON_SWASH_CC) {
 521
 522   /* Initialize the swash unless done already */
 523   if (! PL_utf8_swash_ptrs[classnum]) {
 524    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 525    PL_utf8_swash_ptrs[classnum] =
 526      _core_swash_init("utf8",
 527          "",
 528          &PL_sv_undef, 1, 0,
 529          PL_XPosix_ptrs[classnum], &flags);
 530   }
 531
 532   return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 533         character,
 534         TRUE /* is UTF */ ));
 535  }
 536
 537  switch ((_char_class_number) classnum) {
 538   case _CC_ENUM_SPACE:
 539   case _CC_ENUM_PSXSPC:    return is_XPERLSPACE_high(character);
 540
 541   case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 542   case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 543   case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 544   default:                 return 0;  /* Things like CNTRL are always
 545            below 256 */
 546  }
 547
 548  assert(0); /* NOTREACHED */
 549  return FALSE;
 550 }
 551
 552 /*
 553  * pregexec and friends
 554  */
 555
 556 #ifndef PERL_IN_XSUB_RE
 557 /*
 558  - pregexec - match a regexp against a string
 559  */
 560 I32
 561 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 562   char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 563 /* stringarg: the point in the string at which to begin matching */
 564 /* strend:    pointer to null at end of string */
 565 /* strbeg:    real beginning of string */
 566 /* minend:    end of match must be >= minend bytes after stringarg. */
 567 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 568  *            itself is accessed via the pointers above */
 569 /* nosave:    For optimizations. */
 570 {
 571  PERL_ARGS_ASSERT_PREGEXEC;
 572
 573  return
 574   regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 575      nosave ? 0 : REXEC_COPY_STR);
 576 }
 577 #endif
 578
 579
 580
 581 /* re_intuit_start():
 582  *
 583  * Based on some optimiser hints, try to find the earliest position in the
 584  * string where the regex could match.
 585  *
 586  *   rx:     the regex to match against
 587  *   sv:     the SV being matched: only used for utf8 flag; the string
 588  *           itself is accessed via the pointers below. Note that on
 589  *           something like an overloaded SV, SvPOK(sv) may be false
 590  *           and the string pointers may point to something unrelated to
 591  *           the SV itself.
 592  *   strbeg: real beginning of string
 593  *   strpos: the point in the string at which to begin matching
 594  *   strend: pointer to the byte following the last char of the string
 595  *   flags   currently unused; set to 0
 596  *   data:   currently unused; set to NULL
 597  *
 598  * The basic idea of re_intuit_start() is to use some known information
 599  * about the pattern, namely:
 600  *
 601  *   a) the longest known anchored substring (i.e. one that's at a
 602  *      constant offset from the beginning of the pattern; but not
 603  *      necessarily at a fixed offset from the beginning of the
 604  *      string);
 605  *   b) the longest floating substring (i.e. one that's not at a constant
 606  *      offset from the beginning of the pattern);
 607  *   c) Whether the pattern is anchored to the string; either
 608  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 609  *      or anchored to pos(): /\G/;
 610  *   d) A start class: a real or synthetic character class which
 611  *      represents which characters are legal at the start of the pattern;
 612  *
 613  * to either quickly reject the match, or to find the earliest position
 614  * within the string at which the pattern might match, thus avoiding
 615  * running the full NFA engine at those earlier locations, only to
 616  * eventually fail and retry further along.
 617  *
 618  * Returns NULL if the pattern can't match, or returns the address within
 619  * the string which is the earliest place the match could occur.
 620  *
 621  * The longest of the anchored and floating substrings is called 'check'
 622  * and is checked first. The other is called 'other' and is checked
 623  * second. The 'other' substring may not be present.  For example,
 624  *
 625  *    /(abc|xyz)ABC\d{0,3}DEFG/
 626  *
 627  * will have
 628  *
 629  *   check substr (float)    = "DEFG", offset 6..9 chars
 630  *   other substr (anchored) = "ABC",  offset 3..3 chars
 631  *   stclass = [ax]
 632  *
 633  * Be aware that during the course of this function, sometimes 'anchored'
 634  * refers to a substring being anchored relative to the start of the
 635  * pattern, and sometimes to the pattern itself being anchored relative to
 636  * the string. For example:
 637  *
 638  *   /\dabc/:   "abc" is anchored to the pattern;
 639  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 640  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 641  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 642  *                    but the pattern is anchored to the string.
 643  */
 644
 645 char *
 646 Perl_re_intuit_start(pTHX_
 647      REGEXP * const rx,
 648      SV *sv,
 649      const char * const strbeg,
 650      char *strpos,
 651      char *strend,
 652      const U32 flags,
 653      re_scream_pos_data *data)
 654 {
 655  dVAR;
 656  struct regexp *const prog = ReANY(rx);
 657  SSize_t start_shift = prog->check_offset_min;
 658  /* Should be nonnegative! */
 659  SSize_t end_shift   = 0;
 660  /* current lowest pos in string where the regex can start matching */
 661  char *rx_origin = strpos;
 662  SV *check;
 663  const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 664  U8   other_ix = 1 - prog->substrs->check_ix;
 665  bool ml_anch = 0;
 666  char *other_last = strpos;/* latest pos 'other' substr already checked to */
 667  char *check_at = NULL;  /* check substr found at this pos */
 668  const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 669  RXi_GET_DECL(prog,progi);
 670  regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 671  regmatch_info *const reginfo = &reginfo_buf;
 672  GET_RE_DEBUG_FLAGS_DECL;
 673
 674  PERL_ARGS_ASSERT_RE_INTUIT_START;
 675  PERL_UNUSED_ARG(flags);
 676  PERL_UNUSED_ARG(data);
 677
 678  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 679     "Intuit: trying to determine minimum start position...\n"));
 680
 681  /* for now, assume that all substr offsets are positive. If at some point
 682  * in the future someone wants to do clever things with look-behind and
 683  * -ve offsets, they'll need to fix up any code in this function
 684  * which uses these offsets. See the thread beginning
 685  * <20140113145929.GF27210@iabyn.com>
 686  */
 687  assert(prog->substrs->data[0].min_offset >= 0);
 688  assert(prog->substrs->data[0].max_offset >= 0);
 689  assert(prog->substrs->data[1].min_offset >= 0);
 690  assert(prog->substrs->data[1].max_offset >= 0);
 691  assert(prog->substrs->data[2].min_offset >= 0);
 692  assert(prog->substrs->data[2].max_offset >= 0);
 693
 694  /* for now, assume that if both present, that the floating substring
 695  * doesn't start before the anchored substring.
 696  * If you break this assumption (e.g. doing better optimisations
 697  * with lookahead/behind), then you'll need to audit the code in this
 698  * function carefully first
 699  */
 700  assert(
 701    ! (  (prog->anchored_utf8 || prog->anchored_substr)
 702    && (prog->float_utf8    || prog->float_substr))
 703   || (prog->float_min_offset >= prog->anchored_offset));
 704
 705  /* byte rather than char calculation for efficiency. It fails
 706  * to quickly reject some cases that can't match, but will reject
 707  * them later after doing full char arithmetic */
 708  if (prog->minlen > strend - strpos) {
 709   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 710        "  String too short...\n"));
 711   goto fail;
 712  }
 713
 714  reginfo->is_utf8_target = cBOOL(utf8_target);
 715  reginfo->info_aux = NULL;
 716  reginfo->strbeg = strbeg;
 717  reginfo->strend = strend;
 718  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 719  reginfo->intuit = 1;
 720  /* not actually used within intuit, but zero for safety anyway */
 721  reginfo->poscache_maxiter = 0;
 722
 723  if (utf8_target) {
 724   if (!prog->check_utf8 && prog->check_substr)
 725    to_utf8_substr(prog);
 726   check = prog->check_utf8;
 727  } else {
 728   if (!prog->check_substr && prog->check_utf8) {
 729    if (! to_byte_substr(prog)) {
 730     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 731    }
 732   }
 733   check = prog->check_substr;
 734  }
 735
 736  /* dump the various substring data */
 737  DEBUG_OPTIMISE_MORE_r({
 738   int i;
 739   for (i=0; i<=2; i++) {
 740    SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 741         : prog->substrs->data[i].substr);
 742    if (!sv)
 743     continue;
 744
 745    PerlIO_printf(Perl_debug_log,
 746     "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 747     " useful=%"IVdf" utf8=%d [%s]\n",
 748     i,
 749     (IV)prog->substrs->data[i].min_offset,
 750     (IV)prog->substrs->data[i].max_offset,
 751     (IV)prog->substrs->data[i].end_shift,
 752     BmUSEFUL(sv),
 753     utf8_target ? 1 : 0,
 754     SvPEEK(sv));
 755   }
 756  });
 757
 758  if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 759
 760   /* ml_anch: check after \n?
 761   *
 762   * A note about IMPLICIT: on an un-anchored pattern beginning
 763   * with /.*.../, these flags will have been added by the
 764   * compiler:
 765   *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 766   *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 767   */
 768   ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 769     && !(prog->intflags & PREGf_IMPLICIT);
 770
 771   if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 772    /* we are only allowed to match at BOS or \G */
 773
 774    /* trivially reject if there's a BOS anchor and we're not at BOS.
 775    *
 776    * Note that we don't try to do a similar quick reject for
 777    * \G, since generally the caller will have calculated strpos
 778    * based on pos() and gofs, so the string is already correctly
 779    * anchored by definition; and handling the exceptions would
 780    * be too fiddly (e.g. REXEC_IGNOREPOS).
 781    */
 782    if (   strpos != strbeg
 783     && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL)))
 784    {
 785     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 786         "  Not at start...\n"));
 787     goto fail;
 788    }
 789
 790    /* in the presence of an anchor, the anchored (relative to the
 791    * start of the regex) substr must also be anchored relative
 792    * to strpos. So quickly reject if substr isn't found there.
 793    * This works for \G too, because the caller will already have
 794    * subtracted gofs from pos, and gofs is the offset from the
 795    * \G to the start of the regex. For example, in /.abc\Gdef/,
 796    * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 797    * caller will have set strpos=pos()-4; we look for the substr
 798    * at position pos()-4+1, which lines up with the "a" */
 799
 800    if (prog->check_offset_min == prog->check_offset_max
 801     && !(prog->intflags & PREGf_CANY_SEEN))
 802    {
 803     /* Substring at constant offset from beg-of-str... */
 804     SSize_t slen = SvCUR(check);
 805     char *s = HOP3c(strpos, prog->check_offset_min, strend);
 806
 807     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 808      "  Looking for check substr at fixed offset %"IVdf"...\n",
 809      (IV)prog->check_offset_min));
 810
 811     if (SvTAIL(check)) {
 812      /* In this case, the regex is anchored at the end too.
 813      * Unless it's a multiline match, the lengths must match
 814      * exactly, give or take a \n.  NB: slen >= 1 since
 815      * the last char of check is \n */
 816      if (!multiline
 817       && (   strend - s > slen
 818        || strend - s < slen - 1
 819        || (strend - s == slen && strend[-1] != '\n')))
 820      {
 821       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 822            "  String too long...\n"));
 823       goto fail_finish;
 824      }
 825      /* Now should match s[0..slen-2] */
 826      slen--;
 827     }
 828     if (slen && (*SvPVX_const(check) != *s
 829      || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
 830     {
 831      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 832          "  String not equal...\n"));
 833      goto fail_finish;
 834     }
 835
 836     check_at = s;
 837     goto success_at_start;
 838    }
 839   }
 840  }
 841
 842  end_shift = prog->check_end_shift;
 843
 844 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
 845  if (end_shift < 0)
 846   Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 847     (IV)end_shift, RX_PRECOMP(prog));
 848 #endif
 849
 850   restart:
 851
 852  /* This is the (re)entry point of the main loop in this function.
 853  * The goal of this loop is to:
 854  * 1) find the "check" substring in the region rx_origin..strend
 855  *    (adjusted by start_shift / end_shift). If not found, reject
 856  *    immediately.
 857  * 2) If it exists, look for the "other" substr too if defined; for
 858  *    example, if the check substr maps to the anchored substr, then
 859  *    check the floating substr, and vice-versa. If not found, go
 860  *    back to (1) with rx_origin suitably incremented.
 861  * 3) If we find an rx_origin position that doesn't contradict
 862  *    either of the substrings, then check the possible additional
 863  *    constraints on rx_origin of /^.../m or a known start class.
 864  *    If these fail, then depending on which constraints fail, jump
 865  *    back to here, or to various other re-entry points further along
 866  *    that skip some of the first steps.
 867  * 4) If we pass all those tests, update the BmUSEFUL() count on the
 868  *    substring. If the start position was determined to be at the
 869  *    beginning of the string  - so, not rejected, but not optimised,
 870  *    since we have to run regmatch from position 0 - decrement the
 871  *    BmUSEFUL() count. Otherwise increment it.
 872  */
 873
 874
 875  /* first, look for the 'check' substring */
 876
 877  {
 878   U8* start_point;
 879   U8* end_point;
 880
 881   DEBUG_OPTIMISE_MORE_r({
 882    PerlIO_printf(Perl_debug_log,
 883     "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 884     " Start shift: %"IVdf" End shift %"IVdf
 885     " Real end Shift: %"IVdf"\n",
 886     (IV)(rx_origin - strpos),
 887     (IV)prog->check_offset_min,
 888     (IV)start_shift,
 889     (IV)end_shift,
 890     (IV)prog->check_end_shift);
 891   });
 892
 893   if (prog->intflags & PREGf_CANY_SEEN) {
 894    start_point= (U8*)(rx_origin + start_shift);
 895    end_point= (U8*)(strend - end_shift);
 896    if (start_point > end_point)
 897     goto fail_finish;
 898   } else {
 899    end_point = HOP3(strend, -end_shift, strbeg);
 900    start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 901    if (!start_point)
 902     goto fail_finish;
 903   }
 904
 905
 906   /* If the regex is absolutely anchored to either the start of the
 907   * string (BOL,SBOL) or to pos() (ANCH_GPOS), then
 908   * check_offset_max represents an upper bound on the string where
 909   * the substr could start. For the ANCH_GPOS case, we assume that
 910   * the caller of intuit will have already set strpos to
 911   * pos()-gofs, so in this case strpos + offset_max will still be
 912   * an upper bound on the substr.
 913   */
 914   if (!ml_anch
 915    && prog->intflags & PREGf_ANCH
 916    && prog->check_offset_max != SSize_t_MAX)
 917   {
 918    SSize_t len = SvCUR(check) - !!SvTAIL(check);
 919    const char * const anchor =
 920       (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 921
 922    /* do a bytes rather than chars comparison. It's conservative;
 923    * so it skips doing the HOP if the result can't possibly end
 924    * up earlier than the old value of end_point.
 925    */
 926    if ((char*)end_point - anchor > prog->check_offset_max) {
 927     end_point = HOP3lim((U8*)anchor,
 928         prog->check_offset_max,
 929         end_point -len)
 930        + len;
 931    }
 932   }
 933
 934   DEBUG_OPTIMISE_MORE_r({
 935    PerlIO_printf(Perl_debug_log, "  fbm_instr len=%d str=<%.*s>\n",
 936     (int)(end_point - start_point),
 937     (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 938     start_point);
 939   });
 940
 941   check_at = fbm_instr( start_point, end_point,
 942      check, multiline ? FBMrf_MULTILINE : 0);
 943
 944   /* Update the count-of-usability, remove useless subpatterns,
 945    unshift s.  */
 946
 947   DEBUG_EXECUTE_r({
 948    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 949     SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 950    PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s%s",
 951        (check_at ? "Found" : "Did not find"),
 952     (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 953      ? "anchored" : "floating"),
 954     quoted,
 955     RE_SV_TAIL(check),
 956     (check_at ? " at offset " : "...\n") );
 957   });
 958
 959   if (!check_at)
 960    goto fail_finish;
 961   /* Finish the diagnostic message */
 962   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
 963
 964   /* set rx_origin to the minimum position where the regex could start
 965   * matching, given the constraint of the just-matched check substring.
 966   * But don't set it lower than previously.
 967   */
 968
 969   if (check_at - rx_origin > prog->check_offset_max)
 970    rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 971  }
 972
 973
 974  /* now look for the 'other' substring if defined */
 975
 976  if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 977      : prog->substrs->data[other_ix].substr)
 978  {
 979   /* Take into account the "other" substring. */
 980   char *last, *last1;
 981   char *s;
 982   SV* must;
 983   struct reg_substr_datum *other;
 984
 985  do_other_substr:
 986   other = &prog->substrs->data[other_ix];
 987
 988   /* if "other" is anchored:
 989   * we've previously found a floating substr starting at check_at.
 990   * This means that the regex origin must lie somewhere
 991   * between min (rx_origin): HOP3(check_at, -check_offset_max)
 992   * and max:                 HOP3(check_at, -check_offset_min)
 993   * (except that min will be >= strpos)
 994   * So the fixed  substr must lie somewhere between
 995   *  HOP3(min, anchored_offset)
 996   *  HOP3(max, anchored_offset) + SvCUR(substr)
 997   */
 998
 999   /* if "other" is floating
1000   * Calculate last1, the absolute latest point where the
1001   * floating substr could start in the string, ignoring any
1002   * constraints from the earlier fixed match. It is calculated
1003   * as follows:
1004   *
1005   * strend - prog->minlen (in chars) is the absolute latest
1006   * position within the string where the origin of the regex
1007   * could appear. The latest start point for the floating
1008   * substr is float_min_offset(*) on from the start of the
1009   * regex.  last1 simply combines thee two offsets.
1010   *
1011   * (*) You might think the latest start point should be
1012   * float_max_offset from the regex origin, and technically
1013   * you'd be correct. However, consider
1014   *    /a\d{2,4}bcd\w/
1015   * Here, float min, max are 3,5 and minlen is 7.
1016   * This can match either
1017   *    /a\d\dbcd\w/
1018   *    /a\d\d\dbcd\w/
1019   *    /a\d\d\d\dbcd\w/
1020   * In the first case, the regex matches minlen chars; in the
1021   * second, minlen+1, in the third, minlen+2.
1022   * In the first case, the floating offset is 3 (which equals
1023   * float_min), in the second, 4, and in the third, 5 (which
1024   * equals float_max). In all cases, the floating string bcd
1025   * can never start more than 4 chars from the end of the
1026   * string, which equals minlen - float_min. As the substring
1027   * starts to match more than float_min from the start of the
1028   * regex, it makes the regex match more than minlen chars,
1029   * and the two cancel each other out. So we can always use
1030   * float_min - minlen, rather than float_max - minlen for the
1031   * latest position in the string.
1032   *
1033   * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1034   * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1035   */
1036
1037   assert(prog->minlen >= other->min_offset);
1038   last1 = HOP3c(strend,
1039       other->min_offset - prog->minlen, strbeg);
1040
1041   if (other_ix) {/* i.e. if (other-is-float) */
1042    /* last is the latest point where the floating substr could
1043    * start, *given* any constraints from the earlier fixed
1044    * match. This constraint is that the floating string starts
1045    * <= float_max_offset chars from the regex origin (rx_origin).
1046    * If this value is less than last1, use it instead.
1047    */
1048    assert(rx_origin <= last1);
1049    last =
1050     /* this condition handles the offset==infinity case, and
1051     * is a short-cut otherwise. Although it's comparing a
1052     * byte offset to a char length, it does so in a safe way,
1053     * since 1 char always occupies 1 or more bytes,
1054     * so if a string range is  (last1 - rx_origin) bytes,
1055     * it will be less than or equal to  (last1 - rx_origin)
1056     * chars; meaning it errs towards doing the accurate HOP3
1057     * rather than just using last1 as a short-cut */
1058     (last1 - rx_origin) < other->max_offset
1059      ? last1
1060      : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1061   }
1062   else {
1063    assert(strpos + start_shift <= check_at);
1064    last = HOP4c(check_at, other->min_offset - start_shift,
1065       strbeg, strend);
1066   }
1067
1068   s = HOP3c(rx_origin, other->min_offset, strend);
1069   if (s < other_last) /* These positions already checked */
1070    s = other_last;
1071
1072   must = utf8_target ? other->utf8_substr : other->substr;
1073   assert(SvPOK(must));
1074   s = fbm_instr(
1075    (unsigned char*)s,
1076    (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1077    must,
1078    multiline ? FBMrf_MULTILINE : 0
1079   );
1080   DEBUG_EXECUTE_r({
1081    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1082     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1083    PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s",
1084     s ? "Found" : "Contradicts",
1085     other_ix ? "floating" : "anchored",
1086     quoted, RE_SV_TAIL(must));
1087   });
1088
1089
1090   if (!s) {
1091    /* last1 is latest possible substr location. If we didn't
1092    * find it before there, we never will */
1093    if (last >= last1) {
1094     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1095           ", giving up...\n"));
1096     goto fail_finish;
1097    }
1098
1099    /* try to find the check substr again at a later
1100    * position. Maybe next time we'll find the "other" substr
1101    * in range too */
1102    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1103     ", trying %s at offset %ld...\n",
1104     (other_ix ? "floating" : "anchored"),
1105     (long)(HOP3c(check_at, 1, strend) - strpos)));
1106
1107    other_last = HOP3c(last, 1, strend) /* highest failure */;
1108    rx_origin =
1109     other_ix /* i.e. if other-is-float */
1110      ? HOP3c(rx_origin, 1, strend)
1111      : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1112    goto restart;
1113   }
1114   else {
1115    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
1116     (long)(s - strpos)));
1117
1118    if (other_ix) { /* if (other-is-float) */
1119     /* other_last is set to s, not s+1, since its possible for
1120     * a floating substr to fail first time, then succeed
1121     * second time at the same floating position; e.g.:
1122     *     "-AB--AABZ" =~ /\wAB\d*Z/
1123     * The first time round, anchored and float match at
1124     * "-(AB)--AAB(Z)" then fail on the initial \w character
1125     * class. Second time round, they match at "-AB--A(AB)(Z)".
1126     */
1127     other_last = s;
1128    }
1129    else {
1130     rx_origin = HOP3c(s, -other->min_offset, strbeg);
1131     other_last = HOP3c(s, 1, strend);
1132    }
1133   }
1134  }
1135  else {
1136   DEBUG_OPTIMISE_MORE_r(
1137    PerlIO_printf(Perl_debug_log,
1138     "  Check-only match: offset min:%"IVdf" max:%"IVdf
1139     " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1140     " strend-strpos:%"IVdf"\n",
1141     (IV)prog->check_offset_min,
1142     (IV)prog->check_offset_max,
1143     (IV)(check_at-strpos),
1144     (IV)(rx_origin-strpos),
1145     (IV)(rx_origin-check_at),
1146     (IV)(strend-strpos)
1147    )
1148   );
1149  }
1150
1151   postprocess_substr_matches:
1152
1153  /* handle the extra constraint of /^.../m if present */
1154
1155  if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1156   char *s;
1157
1158   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1159       "  looking for /^/m anchor"));
1160
1161   /* we have failed the constraint of a \n before rx_origin.
1162   * Find the next \n, if any, even if it's beyond the current
1163   * anchored and/or floating substrings. Whether we should be
1164   * scanning ahead for the next \n or the next substr is debatable.
1165   * On the one hand you'd expect rare substrings to appear less
1166   * often than \n's. On the other hand, searching for \n means
1167   * we're effectively flipping been check_substr and "\n" on each
1168   * iteration as the current "rarest" string candidate, which
1169   * means for example that we'll quickly reject the whole string if
1170   * hasn't got a \n, rather than trying every substr position
1171   * first
1172   */
1173
1174   s = HOP3c(strend, - prog->minlen, strpos);
1175   if (s <= rx_origin ||
1176    ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1177   {
1178    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1179        "  Did not find /%s^%s/m...\n",
1180        PL_colors[0], PL_colors[1]));
1181    goto fail_finish;
1182   }
1183
1184   /* earliest possible origin is 1 char after the \n.
1185   * (since *rx_origin == '\n', it's safe to ++ here rather than
1186   * HOP(rx_origin, 1)) */
1187   rx_origin++;
1188
1189   if (prog->substrs->check_ix == 0  /* check is anchored */
1190    || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1191   {
1192    /* Position contradicts check-string; either because
1193    * check was anchored (and thus has no wiggle room),
1194    * or check was float and rx_origin is above the float range */
1195    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1196     "  Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1197     PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1198    goto restart;
1199   }
1200
1201   /* if we get here, the check substr must have been float,
1202   * is in range, and we may or may not have had an anchored
1203   * "other" substr which still contradicts */
1204   assert(prog->substrs->check_ix); /* check is float */
1205
1206   if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1207    /* whoops, the anchored "other" substr exists, so we still
1208    * contradict. On the other hand, the float "check" substr
1209    * didn't contradict, so just retry the anchored "other"
1210    * substr */
1211    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1212     "  Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1213     PL_colors[0], PL_colors[1],
1214     (long)(rx_origin - strpos),
1215     (long)(rx_origin - strpos + prog->anchored_offset)));
1216    goto do_other_substr;
1217   }
1218
1219   /* success: we don't contradict the found floating substring
1220   * (and there's no anchored substr). */
1221   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1222    "  Found /%s^%s/m at offset %ld...\n",
1223    PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1224  }
1225  else {
1226   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1227    "  (multiline anchor test skipped)\n"));
1228  }
1229
1230   success_at_start:
1231
1232
1233  /* if we have a starting character class, then test that extra constraint.
1234  * (trie stclasses are too expensive to use here, we are better off to
1235  * leave it to regmatch itself) */
1236
1237  if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1238   const U8* const str = (U8*)STRING(progi->regstclass);
1239
1240   /* XXX this value could be pre-computed */
1241   const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1242      ?  (reginfo->is_utf8_pat
1243       ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1244       : STR_LEN(progi->regstclass))
1245      : 1);
1246   char * endpos;
1247   char *s;
1248   /* latest pos that a matching float substr constrains rx start to */
1249   char *rx_max_float = NULL;
1250
1251   /* if the current rx_origin is anchored, either by satisfying an
1252   * anchored substring constraint, or a /^.../m constraint, then we
1253   * can reject the current origin if the start class isn't found
1254   * at the current position. If we have a float-only match, then
1255   * rx_origin is constrained to a range; so look for the start class
1256   * in that range. if neither, then look for the start class in the
1257   * whole rest of the string */
1258
1259   /* XXX DAPM it's not clear what the minlen test is for, and why
1260   * it's not used in the floating case. Nothing in the test suite
1261   * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1262   * Here are some old comments, which may or may not be correct:
1263   *
1264   *   minlen == 0 is possible if regstclass is \b or \B,
1265   *   and the fixed substr is ''$.
1266   *   Since minlen is already taken into account, rx_origin+1 is
1267   *   before strend; accidentally, minlen >= 1 guaranties no false
1268   *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1269   *   0) below assumes that regstclass does not come from lookahead...
1270   *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1271   *   This leaves EXACTF-ish only, which are dealt with in
1272   *   find_byclass().
1273   */
1274
1275   if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1276    endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1277   else if (prog->float_substr || prog->float_utf8) {
1278    rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1279    endpos= HOP3c(rx_max_float, cl_l, strend);
1280   }
1281   else
1282    endpos= strend;
1283
1284   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1285    "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1286    " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1287    (IV)start_shift, (IV)(check_at - strbeg),
1288    (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1289
1290   s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1291        reginfo);
1292   if (!s) {
1293    if (endpos == strend) {
1294     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1295         "  Could not match STCLASS...\n") );
1296     goto fail;
1297    }
1298    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1299        "  This position contradicts STCLASS...\n") );
1300    if ((prog->intflags & PREGf_ANCH) && !ml_anch
1301       && !(prog->intflags & PREGf_IMPLICIT))
1302     goto fail;
1303
1304    /* Contradict one of substrings */
1305    if (prog->anchored_substr || prog->anchored_utf8) {
1306     if (prog->substrs->check_ix == 1) { /* check is float */
1307      /* Have both, check_string is floating */
1308      assert(rx_origin + start_shift <= check_at);
1309      if (rx_origin + start_shift != check_at) {
1310       /* not at latest position float substr could match:
1311       * Recheck anchored substring, but not floating.
1312       * The condition above is in bytes rather than
1313       * chars for efficiency. It's conservative, in
1314       * that it errs on the side of doing 'goto
1315       * do_other_substr', where a more accurate
1316       * char-based calculation will be done */
1317       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1318         "  Looking for anchored substr starting at offset %ld...\n",
1319         (long)(other_last - strpos)) );
1320       goto do_other_substr;
1321      }
1322     }
1323    }
1324    else {
1325     /* float-only */
1326
1327     if (ml_anch) {
1328      /* In the presence of ml_anch, we might be able to
1329      * find another \n without breaking the current float
1330      * constraint. */
1331
1332      /* strictly speaking this should be HOP3c(..., 1, ...),
1333      * but since we goto a block of code that's going to
1334      * search for the next \n if any, its safe here */
1335      rx_origin++;
1336      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1337        "  Looking for /%s^%s/m starting at offset %ld...\n",
1338        PL_colors[0], PL_colors[1],
1339        (long)(rx_origin - strpos)) );
1340      goto postprocess_substr_matches;
1341     }
1342
1343     /* strictly speaking this can never be true; but might
1344     * be if we ever allow intuit without substrings */
1345     if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1346      goto fail;
1347
1348     rx_origin = rx_max_float;
1349    }
1350
1351    /* at this point, any matching substrings have been
1352    * contradicted. Start again... */
1353
1354    rx_origin = HOP3c(rx_origin, 1, strend);
1355
1356    /* uses bytes rather than char calculations for efficiency.
1357    * It's conservative: it errs on the side of doing 'goto restart',
1358    * where there is code that does a proper char-based test */
1359    if (rx_origin + start_shift + end_shift > strend) {
1360     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1361          "  Could not match STCLASS...\n") );
1362     goto fail;
1363    }
1364    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1365     "  Looking for %s substr starting at offset %ld...\n",
1366     (prog->substrs->check_ix ? "floating" : "anchored"),
1367     (long)(rx_origin + start_shift - strpos)) );
1368    goto restart;
1369   }
1370
1371   /* Success !!! */
1372
1373   if (rx_origin != s) {
1374    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1375       "  By STCLASS: moving %ld --> %ld\n",
1376         (long)(rx_origin - strpos), (long)(s - strpos))
1377     );
1378   }
1379   else {
1380    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1381         "  Does not contradict STCLASS...\n");
1382     );
1383   }
1384  }
1385
1386  /* Decide whether using the substrings helped */
1387
1388  if (rx_origin != strpos) {
1389   /* Fixed substring is found far enough so that the match
1390   cannot start at strpos. */
1391
1392   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  try at offset...\n"));
1393   ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1394  }
1395  else {
1396   /* The found rx_origin position does not prohibit matching at
1397   * strpos, so calling intuit didn't gain us anything. Decrement
1398   * the BmUSEFUL() count on the check substring, and if we reach
1399   * zero, free it.  */
1400   if (!(prog->intflags & PREGf_NAUGHTY)
1401    && (utf8_target ? (
1402     prog->check_utf8  /* Could be deleted already */
1403     && --BmUSEFUL(prog->check_utf8) < 0
1404     && (prog->check_utf8 == prog->float_utf8)
1405    ) : (
1406     prog->check_substr  /* Could be deleted already */
1407     && --BmUSEFUL(prog->check_substr) < 0
1408     && (prog->check_substr == prog->float_substr)
1409    )))
1410   {
1411    /* If flags & SOMETHING - do not do it many times on the same match */
1412    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  ... Disabling check substring...\n"));
1413    /* XXX Does the destruction order has to change with utf8_target? */
1414    SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1415    SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1416    prog->check_substr = prog->check_utf8 = NULL; /* disable */
1417    prog->float_substr = prog->float_utf8 = NULL; /* clear */
1418    check = NULL;   /* abort */
1419    /* XXXX This is a remnant of the old implementation.  It
1420      looks wasteful, since now INTUIT can use many
1421      other heuristics. */
1422    prog->extflags &= ~RXf_USE_INTUIT;
1423   }
1424  }
1425
1426  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1427    "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1428    PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1429
1430  return rx_origin;
1431
1432   fail_finish:    /* Substring not found */
1433  if (prog->check_substr || prog->check_utf8)  /* could be removed already */
1434   BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1435   fail:
1436  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1437       PL_colors[4], PL_colors[5]));
1438  return NULL;
1439 }
1440
1441
1442 #define DECL_TRIE_TYPE(scan) \
1443  const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1444     trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
1445      trie_type = ((scan->flags == EXACT) \
1446        ? (utf8_target ? trie_utf8 : trie_plain) \
1447        : (scan->flags == EXACTFA) \
1448         ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1449         : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1450
1451 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1452 STMT_START {                                                                        \
1453  STRLEN skiplen;                                                                 \
1454  U8 flags = FOLD_FLAGS_FULL;                                                     \
1455  switch (trie_type) {                                                            \
1456  case trie_utf8_exactfa_fold:                                                    \
1457   flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1458   /* FALL THROUGH */                                                          \
1459  case trie_utf8_fold:                                                            \
1460   if ( foldlen>0 ) {                                                          \
1461    uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1462    foldlen -= len;                                                         \
1463    uscan += len;                                                           \
1464    len=0;                                                                  \
1465   } else {                                                                    \
1466    uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
1467    len = UTF8SKIP(uc);                                                     \
1468    skiplen = UNISKIP( uvc );                                               \
1469    foldlen -= skiplen;                                                     \
1470    uscan = foldbuf + skiplen;                                              \
1471   }                                                                           \
1472   break;                                                                      \
1473  case trie_latin_utf8_exactfa_fold:                                              \
1474   flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1475   /* FALL THROUGH */                                                          \
1476  case trie_latin_utf8_fold:                                                      \
1477   if ( foldlen>0 ) {                                                          \
1478    uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1479    foldlen -= len;                                                         \
1480    uscan += len;                                                           \
1481    len=0;                                                                  \
1482   } else {                                                                    \
1483    len = 1;                                                                \
1484    uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1485    skiplen = UNISKIP( uvc );                                               \
1486    foldlen -= skiplen;                                                     \
1487    uscan = foldbuf + skiplen;                                              \
1488   }                                                                           \
1489   break;                                                                      \
1490  case trie_utf8:                                                                 \
1491   uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1492   break;                                                                      \
1493  case trie_plain:                                                                \
1494   uvc = (UV)*uc;                                                              \
1495   len = 1;                                                                    \
1496  }                                                                               \
1497  if (uvc < 256) {                                                                \
1498   charid = trie->charmap[ uvc ];                                              \
1499  }                                                                               \
1500  else {                                                                          \
1501   charid = 0;                                                                 \
1502   if (widecharmap) {                                                          \
1503    SV** const svpp = hv_fetch(widecharmap,                                 \
1504       (char*)&uvc, sizeof(UV), 0);                                \
1505    if (svpp)                                                               \
1506     charid = (U16)SvIV(*svpp);                                          \
1507   }                                                                           \
1508  }                                                                               \
1509 } STMT_END
1510
1511 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1512 STMT_START {                                              \
1513  while (s <= e) {                                      \
1514   if ( (CoNd)                                       \
1515    && (ln == 1 || folder(s, pat_string, ln))    \
1516    && (reginfo->intuit || regtry(reginfo, &s)) )\
1517    goto got_it;                                  \
1518   s++;                                              \
1519  }                                                     \
1520 } STMT_END
1521
1522 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1523 STMT_START {                                          \
1524  while (s < strend) {                              \
1525   CoDe                                          \
1526   s += UTF8SKIP(s);                             \
1527  }                                                 \
1528 } STMT_END
1529
1530 #define REXEC_FBC_SCAN(CoDe)                          \
1531 STMT_START {                                          \
1532  while (s < strend) {                              \
1533   CoDe                                          \
1534   s++;                                          \
1535  }                                                 \
1536 } STMT_END
1537
1538 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1539 REXEC_FBC_UTF8_SCAN(                                  \
1540  if (CoNd) {                                       \
1541   if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1542    goto got_it;                              \
1543   else                                          \
1544    tmp = doevery;                            \
1545  }                                                 \
1546  else                                              \
1547   tmp = 1;                                      \
1548 )
1549
1550 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1551 REXEC_FBC_SCAN(                                       \
1552  if (CoNd) {                                       \
1553   if (tmp && (reginfo->intuit || regtry(reginfo, &s)))  \
1554    goto got_it;                              \
1555   else                                          \
1556    tmp = doevery;                            \
1557  }                                                 \
1558  else                                              \
1559   tmp = 1;                                      \
1560 )
1561
1562 #define REXEC_FBC_TRYIT                       \
1563 if ((reginfo->intuit || regtry(reginfo, &s))) \
1564  goto got_it
1565
1566 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1567  if (utf8_target) {                                         \
1568   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1569  }                                                          \
1570  else {                                                     \
1571   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1572  }
1573
1574 #define DUMP_EXEC_POS(li,s,doutf8)                          \
1575  dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1576     startpos, doutf8)
1577
1578
1579 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                        \
1580   tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1581   tmp = TEST_NON_UTF8(tmp);                                              \
1582   REXEC_FBC_UTF8_SCAN(                                                   \
1583    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1584     tmp = !tmp;                                                    \
1585     IF_SUCCESS;                                                    \
1586    }                                                                  \
1587    else {                                                             \
1588     IF_FAIL;                                                       \
1589    }                                                                  \
1590   );                                                                     \
1591
1592 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL)                 \
1593   if (s == reginfo->strbeg) {                                            \
1594    tmp = '\n';                                                        \
1595   }                                                                      \
1596   else {                                                                 \
1597    U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);          \
1598    tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                 \
1599              0, UTF8_ALLOW_DEFAULT); \
1600   }                                                                      \
1601   tmp = TeSt1_UtF8;                                                      \
1602   LOAD_UTF8_CHARCLASS_ALNUM();                                           \
1603   REXEC_FBC_UTF8_SCAN(                                                   \
1604    if (tmp == ! (TeSt2_UtF8)) {                                       \
1605     tmp = !tmp;                                                    \
1606     IF_SUCCESS;                                                    \
1607    }                                                                  \
1608    else {                                                             \
1609     IF_FAIL;                                                       \
1610    }                                                                  \
1611   );                                                                     \
1612
1613 /* The only difference between the BOUND and NBOUND cases is that
1614  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1615  * NBOUND.  This is accomplished by passing it in either the if or else clause,
1616  * with the other one being empty */
1617 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1618  FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1619
1620 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1621  FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1622
1623 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1624  FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1625
1626 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1627  FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1628
1629
1630 /* Common to the BOUND and NBOUND cases.  Unfortunately the UTF8 tests need to
1631  * be passed in completely with the variable name being tested, which isn't
1632  * such a clean interface, but this is easier to read than it was before.  We
1633  * are looking for the boundary (or non-boundary between a word and non-word
1634  * character.  The utf8 and non-utf8 cases have the same logic, but the details
1635  * must be different.  Find the "wordness" of the character just prior to this
1636  * one, and compare it with the wordness of this one.  If they differ, we have
1637  * a boundary.  At the beginning of the string, pretend that the previous
1638  * character was a new-line */
1639 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1640  if (utf8_target) {                                                         \
1641     UTF8_CODE                                                      \
1642  }                                                                          \
1643  else {  /* Not utf8 */                                                     \
1644   tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1645   tmp = TEST_NON_UTF8(tmp);                                              \
1646   REXEC_FBC_SCAN(                                                        \
1647    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1648     tmp = !tmp;                                                    \
1649     IF_SUCCESS;                                                    \
1650    }                                                                  \
1651    else {                                                             \
1652     IF_FAIL;                                                       \
1653    }                                                                  \
1654   );                                                                     \
1655  }                                                                          \
1656  if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s)))    \
1657   goto got_it;
1658
1659 /* We know what class REx starts with.  Try to find this position... */
1660 /* if reginfo->intuit, its a dryrun */
1661 /* annoyingly all the vars in this routine have different names from their counterparts
1662    in regmatch. /grrr */
1663
1664 STATIC char *
1665 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1666  const char *strend, regmatch_info *reginfo)
1667 {
1668  dVAR;
1669  const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1670  char *pat_string;   /* The pattern's exactish string */
1671  char *pat_end;     /* ptr to end char of pat_string */
1672  re_fold_t folder; /* Function for computing non-utf8 folds */
1673  const U8 *fold_array;   /* array for folding ords < 256 */
1674  STRLEN ln;
1675  STRLEN lnc;
1676  U8 c1;
1677  U8 c2;
1678  char *e;
1679  I32 tmp = 1; /* Scratch variable? */
1680  const bool utf8_target = reginfo->is_utf8_target;
1681  UV utf8_fold_flags = 0;
1682  const bool is_utf8_pat = reginfo->is_utf8_pat;
1683  bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1684         with a result inverts that result, as 0^1 =
1685         1 and 1^1 = 0 */
1686  _char_class_number classnum;
1687
1688  RXi_GET_DECL(prog,progi);
1689
1690  PERL_ARGS_ASSERT_FIND_BYCLASS;
1691
1692  /* We know what class it must start with. */
1693  switch (OP(c)) {
1694  case ANYOF:
1695   if (utf8_target) {
1696    REXEC_FBC_UTF8_CLASS_SCAN(
1697      reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1698   }
1699   else {
1700    REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1701   }
1702   break;
1703  case CANY:
1704   REXEC_FBC_SCAN(
1705    if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1706     goto got_it;
1707    else
1708     tmp = doevery;
1709   );
1710   break;
1711
1712  case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1713   assert(! is_utf8_pat);
1714   /* FALL THROUGH */
1715  case EXACTFA:
1716   if (is_utf8_pat || utf8_target) {
1717    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1718    goto do_exactf_utf8;
1719   }
1720   fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1721   folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1722   goto do_exactf_non_utf8; /* isn't dealt with by these */
1723
1724  case EXACTF:   /* This node only generated for non-utf8 patterns */
1725   assert(! is_utf8_pat);
1726   if (utf8_target) {
1727    utf8_fold_flags = 0;
1728    goto do_exactf_utf8;
1729   }
1730   fold_array = PL_fold;
1731   folder = foldEQ;
1732   goto do_exactf_non_utf8;
1733
1734  case EXACTFL:
1735   if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1736    utf8_fold_flags = FOLDEQ_LOCALE;
1737    goto do_exactf_utf8;
1738   }
1739   fold_array = PL_fold_locale;
1740   folder = foldEQ_locale;
1741   goto do_exactf_non_utf8;
1742
1743  case EXACTFU_SS:
1744   if (is_utf8_pat) {
1745    utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1746   }
1747   goto do_exactf_utf8;
1748
1749  case EXACTFU:
1750   if (is_utf8_pat || utf8_target) {
1751    utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1752    goto do_exactf_utf8;
1753   }
1754
1755   /* Any 'ss' in the pattern should have been replaced by regcomp,
1756   * so we don't have to worry here about this single special case
1757   * in the Latin1 range */
1758   fold_array = PL_fold_latin1;
1759   folder = foldEQ_latin1;
1760
1761   /* FALL THROUGH */
1762
1763  do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1764       are no glitches with fold-length differences
1765       between the target string and pattern */
1766
1767   /* The idea in the non-utf8 EXACTF* cases is to first find the
1768   * first character of the EXACTF* node and then, if necessary,
1769   * case-insensitively compare the full text of the node.  c1 is the
1770   * first character.  c2 is its fold.  This logic will not work for
1771   * Unicode semantics and the german sharp ss, which hence should
1772   * not be compiled into a node that gets here. */
1773   pat_string = STRING(c);
1774   ln  = STR_LEN(c); /* length to match in octets/bytes */
1775
1776   /* We know that we have to match at least 'ln' bytes (which is the
1777   * same as characters, since not utf8).  If we have to match 3
1778   * characters, and there are only 2 availabe, we know without
1779   * trying that it will fail; so don't start a match past the
1780   * required minimum number from the far end */
1781   e = HOP3c(strend, -((SSize_t)ln), s);
1782
1783   if (reginfo->intuit && e < s) {
1784    e = s;   /* Due to minlen logic of intuit() */
1785   }
1786
1787   c1 = *pat_string;
1788   c2 = fold_array[c1];
1789   if (c1 == c2) { /* If char and fold are the same */
1790    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1791   }
1792   else {
1793    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1794   }
1795   break;
1796
1797  do_exactf_utf8:
1798  {
1799   unsigned expansion;
1800
1801   /* If one of the operands is in utf8, we can't use the simpler folding
1802   * above, due to the fact that many different characters can have the
1803   * same fold, or portion of a fold, or different- length fold */
1804   pat_string = STRING(c);
1805   ln  = STR_LEN(c); /* length to match in octets/bytes */
1806   pat_end = pat_string + ln;
1807   lnc = is_utf8_pat       /* length to match in characters */
1808     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1809     : ln;
1810
1811   /* We have 'lnc' characters to match in the pattern, but because of
1812   * multi-character folding, each character in the target can match
1813   * up to 3 characters (Unicode guarantees it will never exceed
1814   * this) if it is utf8-encoded; and up to 2 if not (based on the
1815   * fact that the Latin 1 folds are already determined, and the
1816   * only multi-char fold in that range is the sharp-s folding to
1817   * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1818   * string character.  Adjust lnc accordingly, rounding up, so that
1819   * if we need to match at least 4+1/3 chars, that really is 5. */
1820   expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1821   lnc = (lnc + expansion - 1) / expansion;
1822
1823   /* As in the non-UTF8 case, if we have to match 3 characters, and
1824   * only 2 are left, it's guaranteed to fail, so don't start a
1825   * match that would require us to go beyond the end of the string
1826   */
1827   e = HOP3c(strend, -((SSize_t)lnc), s);
1828
1829   if (reginfo->intuit && e < s) {
1830    e = s;   /* Due to minlen logic of intuit() */
1831   }
1832
1833   /* XXX Note that we could recalculate e to stop the loop earlier,
1834   * as the worst case expansion above will rarely be met, and as we
1835   * go along we would usually find that e moves further to the left.
1836   * This would happen only after we reached the point in the loop
1837   * where if there were no expansion we should fail.  Unclear if
1838   * worth the expense */
1839
1840   while (s <= e) {
1841    char *my_strend= (char *)strend;
1842    if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1843     pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1844     && (reginfo->intuit || regtry(reginfo, &s)) )
1845    {
1846     goto got_it;
1847    }
1848    s += (utf8_target) ? UTF8SKIP(s) : 1;
1849   }
1850   break;
1851  }
1852  case BOUNDL:
1853   FBC_BOUND(isWORDCHAR_LC,
1854     isWORDCHAR_LC_uvchr(tmp),
1855     isWORDCHAR_LC_utf8((U8*)s));
1856   break;
1857  case NBOUNDL:
1858   FBC_NBOUND(isWORDCHAR_LC,
1859     isWORDCHAR_LC_uvchr(tmp),
1860     isWORDCHAR_LC_utf8((U8*)s));
1861   break;
1862  case BOUND:
1863   FBC_BOUND(isWORDCHAR,
1864     isWORDCHAR_uni(tmp),
1865     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1866   break;
1867  case BOUNDA:
1868   FBC_BOUND_NOLOAD(isWORDCHAR_A,
1869       isWORDCHAR_A(tmp),
1870       isWORDCHAR_A((U8*)s));
1871   break;
1872  case NBOUND:
1873   FBC_NBOUND(isWORDCHAR,
1874     isWORDCHAR_uni(tmp),
1875     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1876   break;
1877  case NBOUNDA:
1878   FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1879       isWORDCHAR_A(tmp),
1880       isWORDCHAR_A((U8*)s));
1881   break;
1882  case BOUNDU:
1883   FBC_BOUND(isWORDCHAR_L1,
1884     isWORDCHAR_uni(tmp),
1885     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1886   break;
1887  case NBOUNDU:
1888   FBC_NBOUND(isWORDCHAR_L1,
1889     isWORDCHAR_uni(tmp),
1890     cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1891   break;
1892  case LNBREAK:
1893   REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1894       is_LNBREAK_latin1_safe(s, strend)
1895   );
1896   break;
1897
1898  /* The argument to all the POSIX node types is the class number to pass to
1899  * _generic_isCC() to build a mask for searching in PL_charclass[] */
1900
1901  case NPOSIXL:
1902   to_complement = 1;
1903   /* FALLTHROUGH */
1904
1905  case POSIXL:
1906   REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1907       to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1908   break;
1909
1910  case NPOSIXD:
1911   to_complement = 1;
1912   /* FALLTHROUGH */
1913
1914  case POSIXD:
1915   if (utf8_target) {
1916    goto posix_utf8;
1917   }
1918   goto posixa;
1919
1920  case NPOSIXA:
1921   if (utf8_target) {
1922    /* The complement of something that matches only ASCII matches all
1923    * UTF-8 variant code points, plus everything in ASCII that isn't
1924    * in the class */
1925    REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1926          || ! _generic_isCC_A(*s, FLAGS(c)));
1927    break;
1928   }
1929
1930   to_complement = 1;
1931   /* FALLTHROUGH */
1932
1933  case POSIXA:
1934  posixa:
1935   /* Don't need to worry about utf8, as it can match only a single
1936   * byte invariant character. */
1937   REXEC_FBC_CLASS_SCAN(
1938       to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1939   break;
1940
1941  case NPOSIXU:
1942   to_complement = 1;
1943   /* FALLTHROUGH */
1944
1945  case POSIXU:
1946   if (! utf8_target) {
1947    REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1948                  FLAGS(c))));
1949   }
1950   else {
1951
1952  posix_utf8:
1953    classnum = (_char_class_number) FLAGS(c);
1954    if (classnum < _FIRST_NON_SWASH_CC) {
1955     while (s < strend) {
1956
1957      /* We avoid loading in the swash as long as possible, but
1958      * should we have to, we jump to a separate loop.  This
1959      * extra 'if' statement is what keeps this code from being
1960      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1961      if (UTF8_IS_ABOVE_LATIN1(*s)) {
1962       goto found_above_latin1;
1963      }
1964      if ((UTF8_IS_INVARIANT(*s)
1965       && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1966                 classnum)))
1967       || (UTF8_IS_DOWNGRADEABLE_START(*s)
1968        && to_complement ^ cBOOL(
1969         _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1970                  *(s + 1)),
1971            classnum))))
1972      {
1973       if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1974        goto got_it;
1975       else {
1976        tmp = doevery;
1977       }
1978      }
1979      else {
1980       tmp = 1;
1981      }
1982      s += UTF8SKIP(s);
1983     }
1984    }
1985    else switch (classnum) {    /* These classes are implemented as
1986           macros */
1987     case _CC_ENUM_SPACE: /* XXX would require separate code if we
1988           revert the change of \v matching this */
1989      /* FALL THROUGH */
1990
1991     case _CC_ENUM_PSXSPC:
1992      REXEC_FBC_UTF8_CLASS_SCAN(
1993           to_complement ^ cBOOL(isSPACE_utf8(s)));
1994      break;
1995
1996     case _CC_ENUM_BLANK:
1997      REXEC_FBC_UTF8_CLASS_SCAN(
1998           to_complement ^ cBOOL(isBLANK_utf8(s)));
1999      break;
2000
2001     case _CC_ENUM_XDIGIT:
2002      REXEC_FBC_UTF8_CLASS_SCAN(
2003          to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2004      break;
2005
2006     case _CC_ENUM_VERTSPACE:
2007      REXEC_FBC_UTF8_CLASS_SCAN(
2008          to_complement ^ cBOOL(isVERTWS_utf8(s)));
2009      break;
2010
2011     case _CC_ENUM_CNTRL:
2012      REXEC_FBC_UTF8_CLASS_SCAN(
2013           to_complement ^ cBOOL(isCNTRL_utf8(s)));
2014      break;
2015
2016     default:
2017      Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2018      assert(0); /* NOTREACHED */
2019    }
2020   }
2021   break;
2022
2023  found_above_latin1:   /* Here we have to load a swash to get the result
2024        for the current code point */
2025   if (! PL_utf8_swash_ptrs[classnum]) {
2026    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2027    PL_utf8_swash_ptrs[classnum] =
2028      _core_swash_init("utf8",
2029          "",
2030          &PL_sv_undef, 1, 0,
2031          PL_XPosix_ptrs[classnum], &flags);
2032   }
2033
2034   /* This is a copy of the loop above for swash classes, though using the
2035   * FBC macro instead of being expanded out.  Since we've loaded the
2036   * swash, we don't have to check for that each time through the loop */
2037   REXEC_FBC_UTF8_CLASS_SCAN(
2038     to_complement ^ cBOOL(_generic_utf8(
2039          classnum,
2040          s,
2041          swash_fetch(PL_utf8_swash_ptrs[classnum],
2042             (U8 *) s, TRUE))));
2043   break;
2044
2045  case AHOCORASICKC:
2046  case AHOCORASICK:
2047   {
2048    DECL_TRIE_TYPE(c);
2049    /* what trie are we using right now */
2050    reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2051    reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2052    HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2053
2054    const char *last_start = strend - trie->minlen;
2055 #ifdef DEBUGGING
2056    const char *real_start = s;
2057 #endif
2058    STRLEN maxlen = trie->maxlen;
2059    SV *sv_points;
2060    U8 **points; /* map of where we were in the input string
2061        when reading a given char. For ASCII this
2062        is unnecessary overhead as the relationship
2063        is always 1:1, but for Unicode, especially
2064        case folded Unicode this is not true. */
2065    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2066    U8 *bitmap=NULL;
2067
2068
2069    GET_RE_DEBUG_FLAGS_DECL;
2070
2071    /* We can't just allocate points here. We need to wrap it in
2072    * an SV so it gets freed properly if there is a croak while
2073    * running the match */
2074    ENTER;
2075    SAVETMPS;
2076    sv_points=newSV(maxlen * sizeof(U8 *));
2077    SvCUR_set(sv_points,
2078     maxlen * sizeof(U8 *));
2079    SvPOK_on(sv_points);
2080    sv_2mortal(sv_points);
2081    points=(U8**)SvPV_nolen(sv_points );
2082    if ( trie_type != trie_utf8_fold
2083     && (trie->bitmap || OP(c)==AHOCORASICKC) )
2084    {
2085     if (trie->bitmap)
2086      bitmap=(U8*)trie->bitmap;
2087     else
2088      bitmap=(U8*)ANYOF_BITMAP(c);
2089    }
2090    /* this is the Aho-Corasick algorithm modified a touch
2091    to include special handling for long "unknown char" sequences.
2092    The basic idea being that we use AC as long as we are dealing
2093    with a possible matching char, when we encounter an unknown char
2094    (and we have not encountered an accepting state) we scan forward
2095    until we find a legal starting char.
2096    AC matching is basically that of trie matching, except that when
2097    we encounter a failing transition, we fall back to the current
2098    states "fail state", and try the current char again, a process
2099    we repeat until we reach the root state, state 1, or a legal
2100    transition. If we fail on the root state then we can either
2101    terminate if we have reached an accepting state previously, or
2102    restart the entire process from the beginning if we have not.
2103
2104    */
2105    while (s <= last_start) {
2106     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2107     U8 *uc = (U8*)s;
2108     U16 charid = 0;
2109     U32 base = 1;
2110     U32 state = 1;
2111     UV uvc = 0;
2112     STRLEN len = 0;
2113     STRLEN foldlen = 0;
2114     U8 *uscan = (U8*)NULL;
2115     U8 *leftmost = NULL;
2116 #ifdef DEBUGGING
2117     U32 accepted_word= 0;
2118 #endif
2119     U32 pointpos = 0;
2120
2121     while ( state && uc <= (U8*)strend ) {
2122      int failed=0;
2123      U32 word = aho->states[ state ].wordnum;
2124
2125      if( state==1 ) {
2126       if ( bitmap ) {
2127        DEBUG_TRIE_EXECUTE_r(
2128         if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2129          dump_exec_pos( (char *)uc, c, strend, real_start,
2130           (char *)uc, utf8_target );
2131          PerlIO_printf( Perl_debug_log,
2132           " Scanning for legal start char...\n");
2133         }
2134        );
2135        if (utf8_target) {
2136         while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2137          uc += UTF8SKIP(uc);
2138         }
2139        } else {
2140         while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2141          uc++;
2142         }
2143        }
2144        s= (char *)uc;
2145       }
2146       if (uc >(U8*)last_start) break;
2147      }
2148
2149      if ( word ) {
2150       U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2151       if (!leftmost || lpos < leftmost) {
2152        DEBUG_r(accepted_word=word);
2153        leftmost= lpos;
2154       }
2155       if (base==0) break;
2156
2157      }
2158      points[pointpos++ % maxlen]= uc;
2159      if (foldlen || uc < (U8*)strend) {
2160       REXEC_TRIE_READ_CHAR(trie_type, trie,
2161           widecharmap, uc,
2162           uscan, len, uvc, charid, foldlen,
2163           foldbuf, uniflags);
2164       DEBUG_TRIE_EXECUTE_r({
2165        dump_exec_pos( (char *)uc, c, strend,
2166           real_start, s, utf8_target);
2167        PerlIO_printf(Perl_debug_log,
2168         " Charid:%3u CP:%4"UVxf" ",
2169         charid, uvc);
2170       });
2171      }
2172      else {
2173       len = 0;
2174       charid = 0;
2175      }
2176
2177
2178      do {
2179 #ifdef DEBUGGING
2180       word = aho->states[ state ].wordnum;
2181 #endif
2182       base = aho->states[ state ].trans.base;
2183
2184       DEBUG_TRIE_EXECUTE_r({
2185        if (failed)
2186         dump_exec_pos( (char *)uc, c, strend, real_start,
2187          s,   utf8_target );
2188        PerlIO_printf( Perl_debug_log,
2189         "%sState: %4"UVxf", word=%"UVxf,
2190         failed ? " Fail transition to " : "",
2191         (UV)state, (UV)word);
2192       });
2193       if ( base ) {
2194        U32 tmp;
2195        I32 offset;
2196        if (charid &&
2197         ( ((offset = base + charid
2198          - 1 - trie->uniquecharcount)) >= 0)
2199         && ((U32)offset < trie->lasttrans)
2200         && trie->trans[offset].check == state
2201         && (tmp=trie->trans[offset].next))
2202        {
2203         DEBUG_TRIE_EXECUTE_r(
2204          PerlIO_printf( Perl_debug_log," - legal\n"));
2205         state = tmp;
2206         break;
2207        }
2208        else {
2209         DEBUG_TRIE_EXECUTE_r(
2210          PerlIO_printf( Perl_debug_log," - fail\n"));
2211         failed = 1;
2212         state = aho->fail[state];
2213        }
2214       }
2215       else {
2216        /* we must be accepting here */
2217        DEBUG_TRIE_EXECUTE_r(
2218          PerlIO_printf( Perl_debug_log," - accepting\n"));
2219        failed = 1;
2220        break;
2221       }
2222      } while(state);
2223      uc += len;
2224      if (failed) {
2225       if (leftmost)
2226        break;
2227       if (!state) state = 1;
2228      }
2229     }
2230     if ( aho->states[ state ].wordnum ) {
2231      U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2232      if (!leftmost || lpos < leftmost) {
2233       DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2234       leftmost = lpos;
2235      }
2236     }
2237     if (leftmost) {
2238      s = (char*)leftmost;
2239      DEBUG_TRIE_EXECUTE_r({
2240       PerlIO_printf(
2241        Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2242        (UV)accepted_word, (IV)(s - real_start)
2243       );
2244      });
2245      if (reginfo->intuit || regtry(reginfo, &s)) {
2246       FREETMPS;
2247       LEAVE;
2248       goto got_it;
2249      }
2250      s = HOPc(s,1);
2251      DEBUG_TRIE_EXECUTE_r({
2252       PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2253      });
2254     } else {
2255      DEBUG_TRIE_EXECUTE_r(
2256       PerlIO_printf( Perl_debug_log,"No match.\n"));
2257      break;
2258     }
2259    }
2260    FREETMPS;
2261    LEAVE;
2262   }
2263   break;
2264  default:
2265   Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2266   break;
2267  }
2268  return 0;
2269   got_it:
2270  return s;
2271 }
2272
2273 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2274  * flags have same meanings as with regexec_flags() */
2275
2276 static void
2277 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2278        char *strbeg,
2279        char *strend,
2280        SV *sv,
2281        U32 flags,
2282        bool utf8_target)
2283 {
2284  struct regexp *const prog = ReANY(rx);
2285
2286  if (flags & REXEC_COPY_STR) {
2287 #ifdef PERL_ANY_COW
2288   if (SvCANCOW(sv)) {
2289    if (DEBUG_C_TEST) {
2290     PerlIO_printf(Perl_debug_log,
2291        "Copy on write: regexp capture, type %d\n",
2292        (int) SvTYPE(sv));
2293    }
2294    /* Create a new COW SV to share the match string and store
2295    * in saved_copy, unless the current COW SV in saved_copy
2296    * is valid and suitable for our purpose */
2297    if ((   prog->saved_copy
2298     && SvIsCOW(prog->saved_copy)
2299     && SvPOKp(prog->saved_copy)
2300     && SvIsCOW(sv)
2301     && SvPOKp(sv)
2302     && SvPVX(sv) == SvPVX(prog->saved_copy)))
2303    {
2304     /* just reuse saved_copy SV */
2305     if (RXp_MATCH_COPIED(prog)) {
2306      Safefree(prog->subbeg);
2307      RXp_MATCH_COPIED_off(prog);
2308     }
2309    }
2310    else {
2311     /* create new COW SV to share string */
2312     RX_MATCH_COPY_FREE(rx);
2313     prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2314    }
2315    prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2316    assert (SvPOKp(prog->saved_copy));
2317    prog->sublen  = strend - strbeg;
2318    prog->suboffset = 0;
2319    prog->subcoffset = 0;
2320   } else
2321 #endif
2322   {
2323    SSize_t min = 0;
2324    SSize_t max = strend - strbeg;
2325    SSize_t sublen;
2326
2327    if (    (flags & REXEC_COPY_SKIP_POST)
2328     && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2329     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2330    ) { /* don't copy $' part of string */
2331     U32 n = 0;
2332     max = -1;
2333     /* calculate the right-most part of the string covered
2334     * by a capture. Due to look-ahead, this may be to
2335     * the right of $&, so we have to scan all captures */
2336     while (n <= prog->lastparen) {
2337      if (prog->offs[n].end > max)
2338       max = prog->offs[n].end;
2339      n++;
2340     }
2341     if (max == -1)
2342      max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2343        ? prog->offs[0].start
2344        : 0;
2345     assert(max >= 0 && max <= strend - strbeg);
2346    }
2347
2348    if (    (flags & REXEC_COPY_SKIP_PRE)
2349     && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2350     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2351    ) { /* don't copy $` part of string */
2352     U32 n = 0;
2353     min = max;
2354     /* calculate the left-most part of the string covered
2355     * by a capture. Due to look-behind, this may be to
2356     * the left of $&, so we have to scan all captures */
2357     while (min && n <= prog->lastparen) {
2358      if (   prog->offs[n].start != -1
2359       && prog->offs[n].start < min)
2360      {
2361       min = prog->offs[n].start;
2362      }
2363      n++;
2364     }
2365     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2366      && min >  prog->offs[0].end
2367     )
2368      min = prog->offs[0].end;
2369
2370    }
2371
2372    assert(min >= 0 && min <= max && min <= strend - strbeg);
2373    sublen = max - min;
2374
2375    if (RX_MATCH_COPIED(rx)) {
2376     if (sublen > prog->sublen)
2377      prog->subbeg =
2378        (char*)saferealloc(prog->subbeg, sublen+1);
2379    }
2380    else
2381     prog->subbeg = (char*)safemalloc(sublen+1);
2382    Copy(strbeg + min, prog->subbeg, sublen, char);
2383    prog->subbeg[sublen] = '\0';
2384    prog->suboffset = min;
2385    prog->sublen = sublen;
2386    RX_MATCH_COPIED_on(rx);
2387   }
2388   prog->subcoffset = prog->suboffset;
2389   if (prog->suboffset && utf8_target) {
2390    /* Convert byte offset to chars.
2391    * XXX ideally should only compute this if @-/@+
2392    * has been seen, a la PL_sawampersand ??? */
2393
2394    /* If there's a direct correspondence between the
2395    * string which we're matching and the original SV,
2396    * then we can use the utf8 len cache associated with
2397    * the SV. In particular, it means that under //g,
2398    * sv_pos_b2u() will use the previously cached
2399    * position to speed up working out the new length of
2400    * subcoffset, rather than counting from the start of
2401    * the string each time. This stops
2402    *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2403    * from going quadratic */
2404    if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2405     prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2406             SV_GMAGIC|SV_CONST_RETURN);
2407    else
2408     prog->subcoffset = utf8_length((U8*)strbeg,
2409          (U8*)(strbeg+prog->suboffset));
2410   }
2411  }
2412  else {
2413   RX_MATCH_COPY_FREE(rx);
2414   prog->subbeg = strbeg;
2415   prog->suboffset = 0;
2416   prog->subcoffset = 0;
2417   prog->sublen = strend - strbeg;
2418  }
2419 }
2420
2421
2422
2423
2424 /*
2425  - regexec_flags - match a regexp against a string
2426  */
2427 I32
2428 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2429    char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2430 /* stringarg: the point in the string at which to begin matching */
2431 /* strend:    pointer to null at end of string */
2432 /* strbeg:    real beginning of string */
2433 /* minend:    end of match must be >= minend bytes after stringarg. */
2434 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2435  *            itself is accessed via the pointers above */
2436 /* data:      May be used for some additional optimizations.
2437    Currently unused. */
2438 /* flags:     For optimizations. See REXEC_* in regexp.h */
2439
2440 {
2441  dVAR;
2442  struct regexp *const prog = ReANY(rx);
2443  char *s;
2444  regnode *c;
2445  char *startpos;
2446  SSize_t minlen;  /* must match at least this many chars */
2447  SSize_t dontbother = 0; /* how many characters not to try at end */
2448  const bool utf8_target = cBOOL(DO_UTF8(sv));
2449  I32 multiline;
2450  RXi_GET_DECL(prog,progi);
2451  regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2452  regmatch_info *const reginfo = &reginfo_buf;
2453  regexp_paren_pair *swap = NULL;
2454  I32 oldsave;
2455  GET_RE_DEBUG_FLAGS_DECL;
2456
2457  PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2458  PERL_UNUSED_ARG(data);
2459
2460  /* Be paranoid... */
2461  if (prog == NULL || stringarg == NULL) {
2462   Perl_croak(aTHX_ "NULL regexp parameter");
2463   return 0;
2464  }
2465
2466  DEBUG_EXECUTE_r(
2467   debug_start_match(rx, utf8_target, stringarg, strend,
2468   "Matching");
2469  );
2470
2471  startpos = stringarg;
2472
2473  if (prog->intflags & PREGf_GPOS_SEEN) {
2474   MAGIC *mg;
2475
2476   /* set reginfo->ganch, the position where \G can match */
2477
2478   reginfo->ganch =
2479    (flags & REXEC_IGNOREPOS)
2480    ? stringarg /* use start pos rather than pos() */
2481    : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2482    /* Defined pos(): */
2483    ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2484    : strbeg; /* pos() not defined; use start of string */
2485
2486   DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2487    "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2488
2489   /* in the presence of \G, we may need to start looking earlier in
2490   * the string than the suggested start point of stringarg:
2491   * if prog->gofs is set, then that's a known, fixed minimum
2492   * offset, such as
2493   * /..\G/:   gofs = 2
2494   * /ab|c\G/: gofs = 1
2495   * or if the minimum offset isn't known, then we have to go back
2496   * to the start of the string, e.g. /w+\G/
2497   */
2498
2499   if (prog->intflags & PREGf_ANCH_GPOS) {
2500    startpos  = reginfo->ganch - prog->gofs;
2501    if (startpos <
2502     ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2503    {
2504     DEBUG_r(PerlIO_printf(Perl_debug_log,
2505       "fail: ganch-gofs before earliest possible start\n"));
2506     return 0;
2507    }
2508   }
2509   else if (prog->gofs) {
2510    if (startpos - prog->gofs < strbeg)
2511     startpos = strbeg;
2512    else
2513     startpos -= prog->gofs;
2514   }
2515   else if (prog->intflags & PREGf_GPOS_FLOAT)
2516    startpos = strbeg;
2517  }
2518
2519  minlen = prog->minlen;
2520  if ((startpos + minlen) > strend || startpos < strbeg) {
2521   DEBUG_r(PerlIO_printf(Perl_debug_log,
2522      "Regex match can't succeed, so not even tried\n"));
2523   return 0;
2524  }
2525
2526  /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2527  * which will call destuctors to reset PL_regmatch_state, free higher
2528  * PL_regmatch_slabs, and clean up regmatch_info_aux and
2529  * regmatch_info_aux_eval */
2530
2531  oldsave = PL_savestack_ix;
2532
2533  s = startpos;
2534
2535  if ((prog->extflags & RXf_USE_INTUIT)
2536   && !(flags & REXEC_CHECKED))
2537  {
2538   s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2539          flags, NULL);
2540   if (!s)
2541    return 0;
2542
2543   if (prog->extflags & RXf_CHECK_ALL) {
2544    /* we can match based purely on the result of INTUIT.
2545    * Set up captures etc just for $& and $-[0]
2546    * (an intuit-only match wont have $1,$2,..) */
2547    assert(!prog->nparens);
2548
2549    /* s/// doesn't like it if $& is earlier than where we asked it to
2550    * start searching (which can happen on something like /.\G/) */
2551    if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2552      && (s < stringarg))
2553    {
2554     /* this should only be possible under \G */
2555     assert(prog->intflags & PREGf_GPOS_SEEN);
2556     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2557      "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2558     goto phooey;
2559    }
2560
2561    /* match via INTUIT shouldn't have any captures.
2562    * Let @-, @+, $^N know */
2563    prog->lastparen = prog->lastcloseparen = 0;
2564    RX_MATCH_UTF8_set(rx, utf8_target);
2565    prog->offs[0].start = s - strbeg;
2566    prog->offs[0].end = utf8_target
2567     ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2568     : s - strbeg + prog->minlenret;
2569    if ( !(flags & REXEC_NOT_FIRST) )
2570     S_reg_set_capture_string(aTHX_ rx,
2571           strbeg, strend,
2572           sv, flags, utf8_target);
2573
2574    return 1;
2575   }
2576  }
2577
2578  multiline = prog->extflags & RXf_PMf_MULTILINE;
2579
2580  if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2581   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2582        "String too short [regexec_flags]...\n"));
2583   goto phooey;
2584  }
2585
2586  /* Check validity of program. */
2587  if (UCHARAT(progi->program) != REG_MAGIC) {
2588   Perl_croak(aTHX_ "corrupted regexp program");
2589  }
2590
2591  RX_MATCH_TAINTED_off(rx);
2592
2593  reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2594  reginfo->intuit = 0;
2595  reginfo->is_utf8_target = cBOOL(utf8_target);
2596  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2597  reginfo->warned = FALSE;
2598  reginfo->strbeg  = strbeg;
2599  reginfo->sv = sv;
2600  reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2601  reginfo->strend = strend;
2602  /* see how far we have to get to not match where we matched before */
2603  reginfo->till = stringarg + minend;
2604
2605  if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2606   /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2607   S_cleanup_regmatch_info_aux has executed (registered by
2608   SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2609   magic belonging to this SV.
2610   Not newSVsv, either, as it does not COW.
2611   */
2612   assert(!IS_PADGV(sv));
2613   reginfo->sv = newSV(0);
2614   SvSetSV_nosteal(reginfo->sv, sv);
2615   SAVEFREESV(reginfo->sv);
2616  }
2617
2618  /* reserve next 2 or 3 slots in PL_regmatch_state:
2619  * slot N+0: may currently be in use: skip it
2620  * slot N+1: use for regmatch_info_aux struct
2621  * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2622  * slot N+3: ready for use by regmatch()
2623  */
2624
2625  {
2626   regmatch_state *old_regmatch_state;
2627   regmatch_slab  *old_regmatch_slab;
2628   int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2629
2630   /* on first ever match, allocate first slab */
2631   if (!PL_regmatch_slab) {
2632    Newx(PL_regmatch_slab, 1, regmatch_slab);
2633    PL_regmatch_slab->prev = NULL;
2634    PL_regmatch_slab->next = NULL;
2635    PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2636   }
2637
2638   old_regmatch_state = PL_regmatch_state;
2639   old_regmatch_slab  = PL_regmatch_slab;
2640
2641   for (i=0; i <= max; i++) {
2642    if (i == 1)
2643     reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2644    else if (i ==2)
2645     reginfo->info_aux_eval =
2646     reginfo->info_aux->info_aux_eval =
2647        &(PL_regmatch_state->u.info_aux_eval);
2648
2649    if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
2650     PL_regmatch_state = S_push_slab(aTHX);
2651   }
2652
2653   /* note initial PL_regmatch_state position; at end of match we'll
2654   * pop back to there and free any higher slabs */
2655
2656   reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2657   reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
2658   reginfo->info_aux->poscache = NULL;
2659
2660   SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2661
2662   if ((prog->extflags & RXf_EVAL_SEEN))
2663    S_setup_eval_state(aTHX_ reginfo);
2664   else
2665    reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2666  }
2667
2668  /* If there is a "must appear" string, look for it. */
2669
2670  if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2671   /* We have to be careful. If the previous successful match
2672   was from this regex we don't want a subsequent partially
2673   successful match to clobber the old results.
2674   So when we detect this possibility we add a swap buffer
2675   to the re, and switch the buffer each match. If we fail,
2676   we switch it back; otherwise we leave it swapped.
2677   */
2678   swap = prog->offs;
2679   /* do we need a save destructor here for eval dies? */
2680   Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2681   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2682    "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2683    PTR2UV(prog),
2684    PTR2UV(swap),
2685    PTR2UV(prog->offs)
2686   ));
2687  }
2688
2689  /* Simplest case:  anchored match need be tried only once. */
2690  /*  [unless only anchor is BOL and multiline is set] */
2691  if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
2692   if (s == startpos && regtry(reginfo, &s))
2693    goto got_it;
2694   else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
2695   {
2696    char *end;
2697
2698    if (minlen)
2699     dontbother = minlen - 1;
2700    end = HOP3c(strend, -dontbother, strbeg) - 1;
2701    /* for multiline we only have to try after newlines */
2702    if (prog->check_substr || prog->check_utf8) {
2703     /* because of the goto we can not easily reuse the macros for bifurcating the
2704     unicode/non-unicode match modes here like we do elsewhere - demerphq */
2705     if (utf8_target) {
2706      if (s == startpos)
2707       goto after_try_utf8;
2708      while (1) {
2709       if (regtry(reginfo, &s)) {
2710        goto got_it;
2711       }
2712      after_try_utf8:
2713       if (s > end) {
2714        goto phooey;
2715       }
2716       if (prog->extflags & RXf_USE_INTUIT) {
2717        s = re_intuit_start(rx, sv, strbeg,
2718          s + UTF8SKIP(s), strend, flags, NULL);
2719        if (!s) {
2720         goto phooey;
2721        }
2722       }
2723       else {
2724        s += UTF8SKIP(s);
2725       }
2726      }
2727     } /* end search for check string in unicode */
2728     else {
2729      if (s == startpos) {
2730       goto after_try_latin;
2731      }
2732      while (1) {
2733       if (regtry(reginfo, &s)) {
2734        goto got_it;
2735       }
2736      after_try_latin:
2737       if (s > end) {
2738        goto phooey;
2739       }
2740       if (prog->extflags & RXf_USE_INTUIT) {
2741        s = re_intuit_start(rx, sv, strbeg,
2742           s + 1, strend, flags, NULL);
2743        if (!s) {
2744         goto phooey;
2745        }
2746       }
2747       else {
2748        s++;
2749       }
2750      }
2751     } /* end search for check string in latin*/
2752    } /* end search for check string */
2753    else { /* search for newline */
2754     if (s > startpos) {
2755      /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2756      s--;
2757     }
2758     /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2759     while (s <= end) { /* note it could be possible to match at the end of the string */
2760      if (*s++ == '\n') { /* don't need PL_utf8skip here */
2761       if (regtry(reginfo, &s))
2762        goto got_it;
2763      }
2764     }
2765    } /* end search for newline */
2766   } /* end anchored/multiline check string search */
2767   goto phooey;
2768  } else if (prog->intflags & PREGf_ANCH_GPOS)
2769  {
2770   /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2771   assert(prog->intflags & PREGf_GPOS_SEEN);
2772   /* For anchored \G, the only position it can match from is
2773   * (ganch-gofs); we already set startpos to this above; if intuit
2774   * moved us on from there, we can't possibly succeed */
2775   assert(startpos == reginfo->ganch - prog->gofs);
2776   if (s == startpos && regtry(reginfo, &s))
2777    goto got_it;
2778   goto phooey;
2779  }
2780
2781  /* Messy cases:  unanchored match. */
2782  if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2783   /* we have /x+whatever/ */
2784   /* it must be a one character string (XXXX Except is_utf8_pat?) */
2785   char ch;
2786 #ifdef DEBUGGING
2787   int did_match = 0;
2788 #endif
2789   if (utf8_target) {
2790    if (! prog->anchored_utf8) {
2791     to_utf8_substr(prog);
2792    }
2793    ch = SvPVX_const(prog->anchored_utf8)[0];
2794    REXEC_FBC_SCAN(
2795     if (*s == ch) {
2796      DEBUG_EXECUTE_r( did_match = 1 );
2797      if (regtry(reginfo, &s)) goto got_it;
2798      s += UTF8SKIP(s);
2799      while (s < strend && *s == ch)
2800       s += UTF8SKIP(s);
2801     }
2802    );
2803
2804   }
2805   else {
2806    if (! prog->anchored_substr) {
2807     if (! to_byte_substr(prog)) {
2808      NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2809     }
2810    }
2811    ch = SvPVX_const(prog->anchored_substr)[0];
2812    REXEC_FBC_SCAN(
2813     if (*s == ch) {
2814      DEBUG_EXECUTE_r( did_match = 1 );
2815      if (regtry(reginfo, &s)) goto got_it;
2816      s++;
2817      while (s < strend && *s == ch)
2818       s++;
2819     }
2820    );
2821   }
2822   DEBUG_EXECUTE_r(if (!did_match)
2823     PerlIO_printf(Perl_debug_log,
2824         "Did not find anchored character...\n")
2825    );
2826  }
2827  else if (prog->anchored_substr != NULL
2828    || prog->anchored_utf8 != NULL
2829    || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2830     && prog->float_max_offset < strend - s)) {
2831   SV *must;
2832   SSize_t back_max;
2833   SSize_t back_min;
2834   char *last;
2835   char *last1;  /* Last position checked before */
2836 #ifdef DEBUGGING
2837   int did_match = 0;
2838 #endif
2839   if (prog->anchored_substr || prog->anchored_utf8) {
2840    if (utf8_target) {
2841     if (! prog->anchored_utf8) {
2842      to_utf8_substr(prog);
2843     }
2844     must = prog->anchored_utf8;
2845    }
2846    else {
2847     if (! prog->anchored_substr) {
2848      if (! to_byte_substr(prog)) {
2849       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2850      }
2851     }
2852     must = prog->anchored_substr;
2853    }
2854    back_max = back_min = prog->anchored_offset;
2855   } else {
2856    if (utf8_target) {
2857     if (! prog->float_utf8) {
2858      to_utf8_substr(prog);
2859     }
2860     must = prog->float_utf8;
2861    }
2862    else {
2863     if (! prog->float_substr) {
2864      if (! to_byte_substr(prog)) {
2865       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2866      }
2867     }
2868     must = prog->float_substr;
2869    }
2870    back_max = prog->float_max_offset;
2871    back_min = prog->float_min_offset;
2872   }
2873
2874   if (back_min<0) {
2875    last = strend;
2876   } else {
2877    last = HOP3c(strend, /* Cannot start after this */
2878     -(SSize_t)(CHR_SVLEN(must)
2879       - (SvTAIL(must) != 0) + back_min), strbeg);
2880   }
2881   if (s > reginfo->strbeg)
2882    last1 = HOPc(s, -1);
2883   else
2884    last1 = s - 1; /* bogus */
2885
2886   /* XXXX check_substr already used to find "s", can optimize if
2887   check_substr==must. */
2888   dontbother = 0;
2889   strend = HOPc(strend, -dontbother);
2890   while ( (s <= last) &&
2891     (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
2892         (unsigned char*)strend, must,
2893         multiline ? FBMrf_MULTILINE : 0)) ) {
2894    DEBUG_EXECUTE_r( did_match = 1 );
2895    if (HOPc(s, -back_max) > last1) {
2896     last1 = HOPc(s, -back_min);
2897     s = HOPc(s, -back_max);
2898    }
2899    else {
2900     char * const t = (last1 >= reginfo->strbeg)
2901          ? HOPc(last1, 1) : last1 + 1;
2902
2903     last1 = HOPc(s, -back_min);
2904     s = t;
2905    }
2906    if (utf8_target) {
2907     while (s <= last1) {
2908      if (regtry(reginfo, &s))
2909       goto got_it;
2910      if (s >= last1) {
2911       s++; /* to break out of outer loop */
2912       break;
2913      }
2914      s += UTF8SKIP(s);
2915     }
2916    }
2917    else {
2918     while (s <= last1) {
2919      if (regtry(reginfo, &s))
2920       goto got_it;
2921      s++;
2922     }
2923    }
2924   }
2925   DEBUG_EXECUTE_r(if (!did_match) {
2926    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2927     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2928    PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2929        ((must == prog->anchored_substr || must == prog->anchored_utf8)
2930        ? "anchored" : "floating"),
2931     quoted, RE_SV_TAIL(must));
2932   });
2933   goto phooey;
2934  }
2935  else if ( (c = progi->regstclass) ) {
2936   if (minlen) {
2937    const OPCODE op = OP(progi->regstclass);
2938    /* don't bother with what can't match */
2939    if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2940     strend = HOPc(strend, -(minlen - 1));
2941   }
2942   DEBUG_EXECUTE_r({
2943    SV * const prop = sv_newmortal();
2944    regprop(prog, prop, c, reginfo);
2945    {
2946     RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2947      s,strend-s,60);
2948     PerlIO_printf(Perl_debug_log,
2949      "Matching stclass %.*s against %s (%d bytes)\n",
2950      (int)SvCUR(prop), SvPVX_const(prop),
2951      quoted, (int)(strend - s));
2952    }
2953   });
2954   if (find_byclass(prog, c, s, strend, reginfo))
2955    goto got_it;
2956   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2957  }
2958  else {
2959   dontbother = 0;
2960   if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2961    /* Trim the end. */
2962    char *last= NULL;
2963    SV* float_real;
2964    STRLEN len;
2965    const char *little;
2966
2967    if (utf8_target) {
2968     if (! prog->float_utf8) {
2969      to_utf8_substr(prog);
2970     }
2971     float_real = prog->float_utf8;
2972    }
2973    else {
2974     if (! prog->float_substr) {
2975      if (! to_byte_substr(prog)) {
2976       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2977      }
2978     }
2979     float_real = prog->float_substr;
2980    }
2981
2982    little = SvPV_const(float_real, len);
2983    if (SvTAIL(float_real)) {
2984      /* This means that float_real contains an artificial \n on
2985      * the end due to the presence of something like this:
2986      * /foo$/ where we can match both "foo" and "foo\n" at the
2987      * end of the string.  So we have to compare the end of the
2988      * string first against the float_real without the \n and
2989      * then against the full float_real with the string.  We
2990      * have to watch out for cases where the string might be
2991      * smaller than the float_real or the float_real without
2992      * the \n. */
2993      char *checkpos= strend - len;
2994      DEBUG_OPTIMISE_r(
2995       PerlIO_printf(Perl_debug_log,
2996        "%sChecking for float_real.%s\n",
2997        PL_colors[4], PL_colors[5]));
2998      if (checkpos + 1 < strbeg) {
2999       /* can't match, even if we remove the trailing \n
3000       * string is too short to match */
3001       DEBUG_EXECUTE_r(
3002        PerlIO_printf(Perl_debug_log,
3003         "%sString shorter than required trailing substring, cannot match.%s\n",
3004         PL_colors[4], PL_colors[5]));
3005       goto phooey;
3006      } else if (memEQ(checkpos + 1, little, len - 1)) {
3007       /* can match, the end of the string matches without the
3008       * "\n" */
3009       last = checkpos + 1;
3010      } else if (checkpos < strbeg) {
3011       /* cant match, string is too short when the "\n" is
3012       * included */
3013       DEBUG_EXECUTE_r(
3014        PerlIO_printf(Perl_debug_log,
3015         "%sString does not contain required trailing substring, cannot match.%s\n",
3016         PL_colors[4], PL_colors[5]));
3017       goto phooey;
3018      } else if (!multiline) {
3019       /* non multiline match, so compare with the "\n" at the
3020       * end of the string */
3021       if (memEQ(checkpos, little, len)) {
3022        last= checkpos;
3023       } else {
3024        DEBUG_EXECUTE_r(
3025         PerlIO_printf(Perl_debug_log,
3026          "%sString does not contain required trailing substring, cannot match.%s\n",
3027          PL_colors[4], PL_colors[5]));
3028        goto phooey;
3029       }
3030      } else {
3031       /* multiline match, so we have to search for a place
3032       * where the full string is located */
3033       goto find_last;
3034      }
3035    } else {
3036     find_last:
3037      if (len)
3038       last = rninstr(s, strend, little, little + len);
3039      else
3040       last = strend; /* matching "$" */
3041    }
3042    if (!last) {
3043     /* at one point this block contained a comment which was
3044     * probably incorrect, which said that this was a "should not
3045     * happen" case.  Even if it was true when it was written I am
3046     * pretty sure it is not anymore, so I have removed the comment
3047     * and replaced it with this one. Yves */
3048     DEBUG_EXECUTE_r(
3049      PerlIO_printf(Perl_debug_log,
3050       "String does not contain required substring, cannot match.\n"
3051      ));
3052     goto phooey;
3053    }
3054    dontbother = strend - last + prog->float_min_offset;
3055   }
3056   if (minlen && (dontbother < minlen))
3057    dontbother = minlen - 1;
3058   strend -= dontbother;      /* this one's always in bytes! */
3059   /* We don't know much -- general case. */
3060   if (utf8_target) {
3061    for (;;) {
3062     if (regtry(reginfo, &s))
3063      goto got_it;
3064     if (s >= strend)
3065      break;
3066     s += UTF8SKIP(s);
3067    };
3068   }
3069   else {
3070    do {
3071     if (regtry(reginfo, &s))
3072      goto got_it;
3073    } while (s++ < strend);
3074   }
3075  }
3076
3077  /* Failure. */
3078  goto phooey;
3079
3080 got_it:
3081  /* s/// doesn't like it if $& is earlier than where we asked it to
3082  * start searching (which can happen on something like /.\G/) */
3083  if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3084    && (prog->offs[0].start < stringarg - strbeg))
3085  {
3086   /* this should only be possible under \G */
3087   assert(prog->intflags & PREGf_GPOS_SEEN);
3088   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3089    "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3090   goto phooey;
3091  }
3092
3093  DEBUG_BUFFERS_r(
3094   if (swap)
3095    PerlIO_printf(Perl_debug_log,
3096     "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3097     PTR2UV(prog),
3098     PTR2UV(swap)
3099    );
3100  );
3101  Safefree(swap);
3102
3103  /* clean up; this will trigger destructors that will free all slabs
3104  * above the current one, and cleanup the regmatch_info_aux
3105  * and regmatch_info_aux_eval sructs */
3106
3107  LEAVE_SCOPE(oldsave);
3108
3109  if (RXp_PAREN_NAMES(prog))
3110   (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3111
3112  RX_MATCH_UTF8_set(rx, utf8_target);
3113
3114  /* make sure $`, $&, $', and $digit will work later */
3115  if ( !(flags & REXEC_NOT_FIRST) )
3116   S_reg_set_capture_string(aTHX_ rx,
3117          strbeg, reginfo->strend,
3118          sv, flags, utf8_target);
3119
3120  return 1;
3121
3122 phooey:
3123  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3124       PL_colors[4], PL_colors[5]));
3125
3126  /* clean up; this will trigger destructors that will free all slabs
3127  * above the current one, and cleanup the regmatch_info_aux
3128  * and regmatch_info_aux_eval sructs */
3129
3130  LEAVE_SCOPE(oldsave);
3131
3132  if (swap) {
3133   /* we failed :-( roll it back */
3134   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3135    "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3136    PTR2UV(prog),
3137    PTR2UV(prog->offs),
3138    PTR2UV(swap)
3139   ));
3140   Safefree(prog->offs);
3141   prog->offs = swap;
3142  }
3143  return 0;
3144 }
3145
3146
3147 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3148  * Do inc before dec, in case old and new rex are the same */
3149 #define SET_reg_curpm(Re2)                          \
3150  if (reginfo->info_aux_eval) {                   \
3151   (void)ReREFCNT_inc(Re2);      \
3152   ReREFCNT_dec(PM_GETRE(PL_reg_curpm));     \
3153   PM_SETRE((PL_reg_curpm), (Re2));     \
3154  }
3155
3156
3157 /*
3158  - regtry - try match at specific point
3159  */
3160 STATIC I32   /* 0 failure, 1 success */
3161 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3162 {
3163  dVAR;
3164  CHECKPOINT lastcp;
3165  REGEXP *const rx = reginfo->prog;
3166  regexp *const prog = ReANY(rx);
3167  SSize_t result;
3168  RXi_GET_DECL(prog,progi);
3169  GET_RE_DEBUG_FLAGS_DECL;
3170
3171  PERL_ARGS_ASSERT_REGTRY;
3172
3173  reginfo->cutpoint=NULL;
3174
3175  prog->offs[0].start = *startposp - reginfo->strbeg;
3176  prog->lastparen = 0;
3177  prog->lastcloseparen = 0;
3178
3179  /* XXXX What this code is doing here?!!!  There should be no need
3180  to do this again and again, prog->lastparen should take care of
3181  this!  --ilya*/
3182
3183  /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3184  * Actually, the code in regcppop() (which Ilya may be meaning by
3185  * prog->lastparen), is not needed at all by the test suite
3186  * (op/regexp, op/pat, op/split), but that code is needed otherwise
3187  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3188  * Meanwhile, this code *is* needed for the
3189  * above-mentioned test suite tests to succeed.  The common theme
3190  * on those tests seems to be returning null fields from matches.
3191  * --jhi updated by dapm */
3192 #if 1
3193  if (prog->nparens) {
3194   regexp_paren_pair *pp = prog->offs;
3195   I32 i;
3196   for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3197    ++pp;
3198    pp->start = -1;
3199    pp->end = -1;
3200   }
3201  }
3202 #endif
3203  REGCP_SET(lastcp);
3204  result = regmatch(reginfo, *startposp, progi->program + 1);
3205  if (result != -1) {
3206   prog->offs[0].end = result;
3207   return 1;
3208  }
3209  if (reginfo->cutpoint)
3210   *startposp= reginfo->cutpoint;
3211  REGCP_UNWIND(lastcp);
3212  return 0;
3213 }
3214
3215
3216 #define sayYES goto yes
3217 #define sayNO goto no
3218 #define sayNO_SILENT goto no_silent
3219
3220 /* we dont use STMT_START/END here because it leads to
3221    "unreachable code" warnings, which are bogus, but distracting. */
3222 #define CACHEsayNO \
3223  if (ST.cache_mask) \
3224  reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3225  sayNO
3226
3227 /* this is used to determine how far from the left messages like
3228    'failed...' are printed. It should be set such that messages
3229    are inline with the regop output that created them.
3230 */
3231 #define REPORT_CODE_OFF 32
3232
3233
3234 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3235 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3236 #define CHRTEST_NOT_A_CP_1 -999
3237 #define CHRTEST_NOT_A_CP_2 -998
3238
3239 /* grab a new slab and return the first slot in it */
3240
3241 STATIC regmatch_state *
3242 S_push_slab(pTHX)
3243 {
3244 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3245  dMY_CXT;
3246 #endif
3247  regmatch_slab *s = PL_regmatch_slab->next;
3248  if (!s) {
3249   Newx(s, 1, regmatch_slab);
3250   s->prev = PL_regmatch_slab;
3251   s->next = NULL;
3252   PL_regmatch_slab->next = s;
3253  }
3254  PL_regmatch_slab = s;
3255  return SLAB_FIRST(s);
3256 }
3257
3258
3259 /* push a new state then goto it */
3260
3261 #define PUSH_STATE_GOTO(state, node, input) \
3262  pushinput = input; \
3263  scan = node; \
3264  st->resume_state = state; \
3265  goto push_state;
3266
3267 /* push a new state with success backtracking, then goto it */
3268
3269 #define PUSH_YES_STATE_GOTO(state, node, input) \
3270  pushinput = input; \
3271  scan = node; \
3272  st->resume_state = state; \
3273  goto push_yes_state;
3274
3275
3276
3277
3278 /*
3279
3280 regmatch() - main matching routine
3281
3282 This is basically one big switch statement in a loop. We execute an op,
3283 set 'next' to point the next op, and continue. If we come to a point which
3284 we may need to backtrack to on failure such as (A|B|C), we push a
3285 backtrack state onto the backtrack stack. On failure, we pop the top
3286 state, and re-enter the loop at the state indicated. If there are no more
3287 states to pop, we return failure.
3288
3289 Sometimes we also need to backtrack on success; for example /A+/, where
3290 after successfully matching one A, we need to go back and try to
3291 match another one; similarly for lookahead assertions: if the assertion
3292 completes successfully, we backtrack to the state just before the assertion
3293 and then carry on.  In these cases, the pushed state is marked as
3294 'backtrack on success too'. This marking is in fact done by a chain of
3295 pointers, each pointing to the previous 'yes' state. On success, we pop to
3296 the nearest yes state, discarding any intermediate failure-only states.
3297 Sometimes a yes state is pushed just to force some cleanup code to be
3298 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3299 it to free the inner regex.
3300
3301 Note that failure backtracking rewinds the cursor position, while
3302 success backtracking leaves it alone.
3303
3304 A pattern is complete when the END op is executed, while a subpattern
3305 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3306 ops trigger the "pop to last yes state if any, otherwise return true"
3307 behaviour.
3308
3309 A common convention in this function is to use A and B to refer to the two
3310 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3311 the subpattern to be matched possibly multiple times, while B is the entire
3312 rest of the pattern. Variable and state names reflect this convention.
3313
3314 The states in the main switch are the union of ops and failure/success of
3315 substates associated with with that op.  For example, IFMATCH is the op
3316 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3317 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3318 successfully matched A and IFMATCH_A_fail is a state saying that we have
3319 just failed to match A. Resume states always come in pairs. The backtrack
3320 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3321 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3322 on success or failure.
3323
3324 The struct that holds a backtracking state is actually a big union, with
3325 one variant for each major type of op. The variable st points to the
3326 top-most backtrack struct. To make the code clearer, within each
3327 block of code we #define ST to alias the relevant union.
3328
3329 Here's a concrete example of a (vastly oversimplified) IFMATCH
3330 implementation:
3331
3332  switch (state) {
3333  ....
3334
3335 #define ST st->u.ifmatch
3336
3337  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3338   ST.foo = ...; // some state we wish to save
3339   ...
3340   // push a yes backtrack state with a resume value of
3341   // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3342   // first node of A:
3343   PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3344   // NOTREACHED
3345
3346  case IFMATCH_A: // we have successfully executed A; now continue with B
3347   next = B;
3348   bar = ST.foo; // do something with the preserved value
3349   break;
3350
3351  case IFMATCH_A_fail: // A failed, so the assertion failed
3352   ...;   // do some housekeeping, then ...
3353   sayNO; // propagate the failure
3354
3355 #undef ST
3356
3357  ...
3358  }
3359
3360 For any old-timers reading this who are familiar with the old recursive
3361 approach, the code above is equivalent to:
3362
3363  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3364  {
3365   int foo = ...
3366   ...
3367   if (regmatch(A)) {
3368    next = B;
3369    bar = foo;
3370    break;
3371   }
3372   ...;   // do some housekeeping, then ...
3373   sayNO; // propagate the failure
3374  }
3375
3376 The topmost backtrack state, pointed to by st, is usually free. If you
3377 want to claim it, populate any ST.foo fields in it with values you wish to
3378 save, then do one of
3379
3380   PUSH_STATE_GOTO(resume_state, node, newinput);
3381   PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3382
3383 which sets that backtrack state's resume value to 'resume_state', pushes a
3384 new free entry to the top of the backtrack stack, then goes to 'node'.
3385 On backtracking, the free slot is popped, and the saved state becomes the
3386 new free state. An ST.foo field in this new top state can be temporarily
3387 accessed to retrieve values, but once the main loop is re-entered, it
3388 becomes available for reuse.
3389
3390 Note that the depth of the backtrack stack constantly increases during the
3391 left-to-right execution of the pattern, rather than going up and down with
3392 the pattern nesting. For example the stack is at its maximum at Z at the
3393 end of the pattern, rather than at X in the following:
3394
3395  /(((X)+)+)+....(Y)+....Z/
3396
3397 The only exceptions to this are lookahead/behind assertions and the cut,
3398 (?>A), which pop all the backtrack states associated with A before
3399 continuing.
3400
3401 Backtrack state structs are allocated in slabs of about 4K in size.
3402 PL_regmatch_state and st always point to the currently active state,
3403 and PL_regmatch_slab points to the slab currently containing
3404 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3405 allocated, and is never freed until interpreter destruction. When the slab
3406 is full, a new one is allocated and chained to the end. At exit from
3407 regmatch(), slabs allocated since entry are freed.
3408
3409 */
3410
3411
3412 #define DEBUG_STATE_pp(pp)        \
3413  DEBUG_STATE_r({         \
3414   DUMP_EXEC_POS(locinput, scan, utf8_target);         \
3415   PerlIO_printf(Perl_debug_log,       \
3416    "    %*s"pp" %s%s%s%s%s\n",       \
3417    depth*2, "",        \
3418    PL_reg_name[st->resume_state],                  \
3419    ((st==yes_state||st==mark_state) ? "[" : ""),   \
3420    ((st==yes_state) ? "Y" : ""),                   \
3421    ((st==mark_state) ? "M" : ""),                  \
3422    ((st==yes_state||st==mark_state) ? "]" : "")    \
3423   );                                                  \
3424  });
3425
3426
3427 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3428
3429 #ifdef DEBUGGING
3430
3431 STATIC void
3432 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3433  const char *start, const char *end, const char *blurb)
3434 {
3435  const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3436
3437  PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3438
3439  if (!PL_colorset)
3440    reginitcolors();
3441  {
3442   RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3443    RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3444
3445   RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3446    start, end - start, 60);
3447
3448   PerlIO_printf(Perl_debug_log,
3449    "%s%s REx%s %s against %s\n",
3450      PL_colors[4], blurb, PL_colors[5], s0, s1);
3451
3452   if (utf8_target||utf8_pat)
3453    PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3454     utf8_pat ? "pattern" : "",
3455     utf8_pat && utf8_target ? " and " : "",
3456     utf8_target ? "string" : ""
3457    );
3458  }
3459 }
3460
3461 STATIC void
3462 S_dump_exec_pos(pTHX_ const char *locinput,
3463      const regnode *scan,
3464      const char *loc_regeol,
3465      const char *loc_bostr,
3466      const char *loc_reg_starttry,
3467      const bool utf8_target)
3468 {
3469  const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3470  const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3471  int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3472  /* The part of the string before starttry has one color
3473  (pref0_len chars), between starttry and current
3474  position another one (pref_len - pref0_len chars),
3475  after the current position the third one.
3476  We assume that pref0_len <= pref_len, otherwise we
3477  decrease pref0_len.  */
3478  int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3479   ? (5 + taill) - l : locinput - loc_bostr;
3480  int pref0_len;
3481
3482  PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3483
3484  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3485   pref_len++;
3486  pref0_len = pref_len  - (locinput - loc_reg_starttry);
3487  if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3488   l = ( loc_regeol - locinput > (5 + taill) - pref_len
3489    ? (5 + taill) - pref_len : loc_regeol - locinput);
3490  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3491   l--;
3492  if (pref0_len < 0)
3493   pref0_len = 0;
3494  if (pref0_len > pref_len)
3495   pref0_len = pref_len;
3496  {
3497   const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3498
3499   RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3500    (locinput - pref_len),pref0_len, 60, 4, 5);
3501
3502   RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3503      (locinput - pref_len + pref0_len),
3504      pref_len - pref0_len, 60, 2, 3);
3505
3506   RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3507      locinput, loc_regeol - locinput, 10, 0, 1);
3508
3509   const STRLEN tlen=len0+len1+len2;
3510   PerlIO_printf(Perl_debug_log,
3511      "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3512      (IV)(locinput - loc_bostr),
3513      len0, s0,
3514      len1, s1,
3515      (docolor ? "" : "> <"),
3516      len2, s2,
3517      (int)(tlen > 19 ? 0 :  19 - tlen),
3518      "");
3519  }
3520 }
3521
3522 #endif
3523
3524 /* reg_check_named_buff_matched()
3525  * Checks to see if a named buffer has matched. The data array of
3526  * buffer numbers corresponding to the buffer is expected to reside
3527  * in the regexp->data->data array in the slot stored in the ARG() of
3528  * node involved. Note that this routine doesn't actually care about the
3529  * name, that information is not preserved from compilation to execution.
3530  * Returns the index of the leftmost defined buffer with the given name
3531  * or 0 if non of the buffers matched.
3532  */
3533 STATIC I32
3534 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3535 {
3536  I32 n;
3537  RXi_GET_DECL(rex,rexi);
3538  SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3539  I32 *nums=(I32*)SvPVX(sv_dat);
3540
3541  PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3542
3543  for ( n=0; n<SvIVX(sv_dat); n++ ) {
3544   if ((I32)rex->lastparen >= nums[n] &&
3545    rex->offs[nums[n]].end != -1)
3546   {
3547    return nums[n];
3548   }
3549  }
3550  return 0;
3551 }
3552
3553
3554 static bool
3555 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3556   U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3557 {
3558  /* This function determines if there are one or two characters that match
3559  * the first character of the passed-in EXACTish node <text_node>, and if
3560  * so, returns them in the passed-in pointers.
3561  *
3562  * If it determines that no possible character in the target string can
3563  * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3564  * the first character in <text_node> requires UTF-8 to represent, and the
3565  * target string isn't in UTF-8.)
3566  *
3567  * If there are more than two characters that could match the beginning of
3568  * <text_node>, or if more context is required to determine a match or not,
3569  * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3570  *
3571  * The motiviation behind this function is to allow the caller to set up
3572  * tight loops for matching.  If <text_node> is of type EXACT, there is
3573  * only one possible character that can match its first character, and so
3574  * the situation is quite simple.  But things get much more complicated if
3575  * folding is involved.  It may be that the first character of an EXACTFish
3576  * node doesn't participate in any possible fold, e.g., punctuation, so it
3577  * can be matched only by itself.  The vast majority of characters that are
3578  * in folds match just two things, their lower and upper-case equivalents.
3579  * But not all are like that; some have multiple possible matches, or match
3580  * sequences of more than one character.  This function sorts all that out.
3581  *
3582  * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3583  * loop of trying to match A*, we know we can't exit where the thing
3584  * following it isn't a B.  And something can't be a B unless it is the
3585  * beginning of B.  By putting a quick test for that beginning in a tight
3586  * loop, we can rule out things that can't possibly be B without having to
3587  * break out of the loop, thus avoiding work.  Similarly, if A is a single
3588  * character, we can make a tight loop matching A*, using the outputs of
3589  * this function.
3590  *
3591  * If the target string to match isn't in UTF-8, and there aren't
3592  * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3593  * the one or two possible octets (which are characters in this situation)
3594  * that can match.  In all cases, if there is only one character that can
3595  * match, *<c1p> and *<c2p> will be identical.
3596  *
3597  * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3598  * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3599  * can match the beginning of <text_node>.  They should be declared with at
3600  * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3601  * undefined what these contain.)  If one or both of the buffers are
3602  * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3603  * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3604  * *<c2p> will be set to a negative number(s) that shouldn't match any code
3605  * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3606  * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3607
3608  const bool utf8_target = reginfo->is_utf8_target;
3609
3610  UV c1 = CHRTEST_NOT_A_CP_1;
3611  UV c2 = CHRTEST_NOT_A_CP_2;
3612  bool use_chrtest_void = FALSE;
3613  const bool is_utf8_pat = reginfo->is_utf8_pat;
3614
3615  /* Used when we have both utf8 input and utf8 output, to avoid converting
3616  * to/from code points */
3617  bool utf8_has_been_setup = FALSE;
3618
3619  dVAR;
3620
3621  U8 *pat = (U8*)STRING(text_node);
3622  U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3623
3624  if (OP(text_node) == EXACT) {
3625
3626   /* In an exact node, only one thing can be matched, that first
3627   * character.  If both the pat and the target are UTF-8, we can just
3628   * copy the input to the output, avoiding finding the code point of
3629   * that character */
3630   if (!is_utf8_pat) {
3631    c2 = c1 = *pat;
3632   }
3633   else if (utf8_target) {
3634    Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3635    Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3636    utf8_has_been_setup = TRUE;
3637   }
3638   else {
3639    c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3640   }
3641  }
3642  else { /* an EXACTFish node */
3643   U8 *pat_end = pat + STR_LEN(text_node);
3644
3645   /* An EXACTFL node has at least some characters unfolded, because what
3646   * they match is not known until now.  So, now is the time to fold
3647   * the first few of them, as many as are needed to determine 'c1' and
3648   * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
3649   * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3650   * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
3651   * need to fold as many characters as a single character can fold to,
3652   * so that later we can check if the first ones are such a multi-char
3653   * fold.  But, in such a pattern only locale-problematic characters
3654   * aren't folded, so we can skip this completely if the first character
3655   * in the node isn't one of the tricky ones */
3656   if (OP(text_node) == EXACTFL) {
3657
3658    if (! is_utf8_pat) {
3659     if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3660     {
3661      folded[0] = folded[1] = 's';
3662      pat = folded;
3663      pat_end = folded + 2;
3664     }
3665    }
3666    else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3667     U8 *s = pat;
3668     U8 *d = folded;
3669     int i;
3670
3671     for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3672      if (isASCII(*s)) {
3673       *(d++) = (U8) toFOLD_LC(*s);
3674       s++;
3675      }
3676      else {
3677       STRLEN len;
3678       _to_utf8_fold_flags(s,
3679            d,
3680            &len,
3681            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3682       d += len;
3683       s += UTF8SKIP(s);
3684      }
3685     }
3686
3687     pat = folded;
3688     pat_end = d;
3689    }
3690   }
3691
3692   if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
3693    || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
3694   {
3695    /* Multi-character folds require more context to sort out.  Also
3696    * PL_utf8_foldclosures used below doesn't handle them, so have to
3697    * be handled outside this routine */
3698    use_chrtest_void = TRUE;
3699   }
3700   else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3701    c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3702    if (c1 > 256) {
3703     /* Load the folds hash, if not already done */
3704     SV** listp;
3705     if (! PL_utf8_foldclosures) {
3706      if (! PL_utf8_tofold) {
3707       U8 dummy[UTF8_MAXBYTES_CASE+1];
3708
3709       /* Force loading this by folding an above-Latin1 char */
3710       to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3711       assert(PL_utf8_tofold); /* Verify that worked */
3712      }
3713      PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3714     }
3715
3716     /* The fold closures data structure is a hash with the keys
3717     * being the UTF-8 of every character that is folded to, like
3718     * 'k', and the values each an array of all code points that
3719     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
3720     * Multi-character folds are not included */
3721     if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3722           (char *) pat,
3723           UTF8SKIP(pat),
3724           FALSE))))
3725     {
3726      /* Not found in the hash, therefore there are no folds
3727      * containing it, so there is only a single character that
3728      * could match */
3729      c2 = c1;
3730     }
3731     else {  /* Does participate in folds */
3732      AV* list = (AV*) *listp;
3733      if (av_tindex(list) != 1) {
3734
3735       /* If there aren't exactly two folds to this, it is
3736       * outside the scope of this function */
3737       use_chrtest_void = TRUE;
3738      }
3739      else {  /* There are two.  Get them */
3740       SV** c_p = av_fetch(list, 0, FALSE);
3741       if (c_p == NULL) {
3742        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3743       }
3744       c1 = SvUV(*c_p);
3745
3746       c_p = av_fetch(list, 1, FALSE);
3747       if (c_p == NULL) {
3748        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3749       }
3750       c2 = SvUV(*c_p);
3751
3752       /* Folds that cross the 255/256 boundary are forbidden
3753       * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
3754       * one is ASCIII.  Since the pattern character is above
3755       * 256, and its only other match is below 256, the only
3756       * legal match will be to itself.  We have thrown away
3757       * the original, so have to compute which is the one
3758       * above 255 */
3759       if ((c1 < 256) != (c2 < 256)) {
3760        if ((OP(text_node) == EXACTFL
3761         && ! IN_UTF8_CTYPE_LOCALE)
3762         || ((OP(text_node) == EXACTFA
3763          || OP(text_node) == EXACTFA_NO_TRIE)
3764          && (isASCII(c1) || isASCII(c2))))
3765        {
3766         if (c1 < 256) {
3767          c1 = c2;
3768         }
3769         else {
3770          c2 = c1;
3771         }
3772        }
3773       }
3774      }
3775     }
3776    }
3777    else /* Here, c1 is < 255 */
3778     if (utf8_target
3779      && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3780      && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
3781      && ((OP(text_node) != EXACTFA
3782       && OP(text_node) != EXACTFA_NO_TRIE)
3783       || ! isASCII(c1)))
3784    {
3785     /* Here, there could be something above Latin1 in the target
3786     * which folds to this character in the pattern.  All such
3787     * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
3788     * than two characters involved in their folds, so are outside
3789     * the scope of this function */
3790     if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3791      c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3792     }
3793     else {
3794      use_chrtest_void = TRUE;
3795     }
3796    }
3797    else { /* Here nothing above Latin1 can fold to the pattern
3798      character */
3799     switch (OP(text_node)) {
3800
3801      case EXACTFL:   /* /l rules */
3802       c2 = PL_fold_locale[c1];
3803       break;
3804
3805      case EXACTF:   /* This node only generated for non-utf8
3806          patterns */
3807       assert(! is_utf8_pat);
3808       if (! utf8_target) {    /* /d rules */
3809        c2 = PL_fold[c1];
3810        break;
3811       }
3812       /* FALLTHROUGH */
3813       /* /u rules for all these.  This happens to work for
3814       * EXACTFA as nothing in Latin1 folds to ASCII */
3815      case EXACTFA_NO_TRIE:   /* This node only generated for
3816            non-utf8 patterns */
3817       assert(! is_utf8_pat);
3818       /* FALL THROUGH */
3819      case EXACTFA:
3820      case EXACTFU_SS:
3821      case EXACTFU:
3822       c2 = PL_fold_latin1[c1];
3823       break;
3824
3825      default:
3826       Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3827       assert(0); /* NOTREACHED */
3828     }
3829    }
3830   }
3831  }
3832
3833  /* Here have figured things out.  Set up the returns */
3834  if (use_chrtest_void) {
3835   *c2p = *c1p = CHRTEST_VOID;
3836  }
3837  else if (utf8_target) {
3838   if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
3839    uvchr_to_utf8(c1_utf8, c1);
3840    uvchr_to_utf8(c2_utf8, c2);
3841   }
3842
3843   /* Invariants are stored in both the utf8 and byte outputs; Use
3844   * negative numbers otherwise for the byte ones.  Make sure that the
3845   * byte ones are the same iff the utf8 ones are the same */
3846   *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3847   *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3848     ? *c2_utf8
3849     : (c1 == c2)
3850     ? CHRTEST_NOT_A_CP_1
3851     : CHRTEST_NOT_A_CP_2;
3852  }
3853  else if (c1 > 255) {
3854  if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
3855       can represent */
3856   return FALSE;
3857  }
3858
3859  *c1p = *c2p = c2;    /* c2 is the only representable value */
3860  }
3861  else {  /* c1 is representable; see about c2 */
3862  *c1p = c1;
3863  *c2p = (c2 < 256) ? c2 : c1;
3864  }
3865
3866  return TRUE;
3867 }
3868
3869 /* returns -1 on failure, $+[0] on success */
3870 STATIC SSize_t
3871 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3872 {
3873 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3874  dMY_CXT;
3875 #endif
3876  dVAR;
3877  const bool utf8_target = reginfo->is_utf8_target;
3878  const U32 uniflags = UTF8_ALLOW_DEFAULT;
3879  REGEXP *rex_sv = reginfo->prog;
3880  regexp *rex = ReANY(rex_sv);
3881  RXi_GET_DECL(rex,rexi);
3882  /* the current state. This is a cached copy of PL_regmatch_state */
3883  regmatch_state *st;
3884  /* cache heavy used fields of st in registers */
3885  regnode *scan;
3886  regnode *next;
3887  U32 n = 0; /* general value; init to avoid compiler warning */
3888  SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
3889  char *locinput = startpos;
3890  char *pushinput; /* where to continue after a PUSH */
3891  I32 nextchr;   /* is always set to UCHARAT(locinput) */
3892
3893  bool result = 0;     /* return value of S_regmatch */
3894  int depth = 0;     /* depth of backtrack stack */
3895  U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3896  const U32 max_nochange_depth =
3897   (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3898   3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3899  regmatch_state *yes_state = NULL; /* state to pop to on success of
3900                subpattern */
3901  /* mark_state piggy backs on the yes_state logic so that when we unwind
3902  the stack on success we can update the mark_state as we go */
3903  regmatch_state *mark_state = NULL; /* last mark state we have seen */
3904  regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3905  struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
3906  U32 state_num;
3907  bool no_final = 0;      /* prevent failure from backtracking? */
3908  bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
3909  char *startpoint = locinput;
3910  SV *popmark = NULL;     /* are we looking for a mark? */
3911  SV *sv_commit = NULL;   /* last mark name seen in failure */
3912  SV *sv_yes_mark = NULL; /* last mark name we have seen
3913        during a successful match */
3914  U32 lastopen = 0;       /* last open we saw */
3915  bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3916  SV* const oreplsv = GvSVn(PL_replgv);
3917  /* these three flags are set by various ops to signal information to
3918  * the very next op. They have a useful lifetime of exactly one loop
3919  * iteration, and are not preserved or restored by state pushes/pops
3920  */
3921  bool sw = 0;     /* the condition value in (?(cond)a|b) */
3922  bool minmod = 0;     /* the next "{n,m}" is a "{n,m}?" */
3923  int logical = 0;     /* the following EVAL is:
3924         0: (?{...})
3925         1: (?(?{...})X|Y)
3926         2: (??{...})
3927        or the following IFMATCH/UNLESSM is:
3928         false: plain (?=foo)
3929         true:  used as a condition: (?(?=foo))
3930        */
3931  PAD* last_pad = NULL;
3932  dMULTICALL;
3933  I32 gimme = G_SCALAR;
3934  CV *caller_cv = NULL; /* who called us */
3935  CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3936  CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3937  U32 maxopenparen = 0;       /* max '(' index seen so far */
3938  int to_complement;  /* Invert the result? */
3939  _char_class_number classnum;
3940  bool is_utf8_pat = reginfo->is_utf8_pat;
3941
3942 #ifdef DEBUGGING
3943  GET_RE_DEBUG_FLAGS_DECL;
3944 #endif
3945
3946  /* protect against undef(*^R) */
3947  SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
3948
3949  /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3950  multicall_oldcatch = 0;
3951  multicall_cv = NULL;
3952  cx = NULL;
3953  PERL_UNUSED_VAR(multicall_cop);
3954  PERL_UNUSED_VAR(newsp);
3955
3956
3957  PERL_ARGS_ASSERT_REGMATCH;
3958
3959  DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3960    PerlIO_printf(Perl_debug_log,"regmatch start\n");
3961  }));
3962
3963  st = PL_regmatch_state;
3964
3965  /* Note that nextchr is a byte even in UTF */
3966  SET_nextchr;
3967  scan = prog;
3968  while (scan != NULL) {
3969
3970   DEBUG_EXECUTE_r( {
3971    SV * const prop = sv_newmortal();
3972    regnode *rnext=regnext(scan);
3973    DUMP_EXEC_POS( locinput, scan, utf8_target );
3974    regprop(rex, prop, scan, reginfo);
3975
3976    PerlIO_printf(Perl_debug_log,
3977      "%3"IVdf":%*s%s(%"IVdf")\n",
3978      (IV)(scan - rexi->program), depth*2, "",
3979      SvPVX_const(prop),
3980      (PL_regkind[OP(scan)] == END || !rnext) ?
3981       0 : (IV)(rnext - rexi->program));
3982   });
3983
3984   next = scan + NEXT_OFF(scan);
3985   if (next == scan)
3986    next = NULL;
3987   state_num = OP(scan);
3988
3989   REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3990  reenter_switch:
3991   to_complement = 0;
3992
3993   SET_nextchr;
3994   assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3995
3996   switch (state_num) {
3997   case BOL:  /*  /^../   */
3998   case SBOL: /*  /^../s  */
3999    if (locinput == reginfo->strbeg)
4000     break;
4001    sayNO;
4002
4003   case MBOL: /*  /^../m  */
4004    if (locinput == reginfo->strbeg ||
4005     (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
4006    {
4007     break;
4008    }
4009    sayNO;
4010
4011   case GPOS: /*  \G  */
4012    if (locinput == reginfo->ganch)
4013     break;
4014    sayNO;
4015
4016   case KEEPS: /*   \K  */
4017    /* update the startpoint */
4018    st->u.keeper.val = rex->offs[0].start;
4019    rex->offs[0].start = locinput - reginfo->strbeg;
4020    PUSH_STATE_GOTO(KEEPS_next, next, locinput);
4021    assert(0); /*NOTREACHED*/
4022   case KEEPS_next_fail:
4023    /* rollback the start point change */
4024    rex->offs[0].start = st->u.keeper.val;
4025    sayNO_SILENT;
4026    assert(0); /*NOTREACHED*/
4027
4028   case MEOL: /* /..$/m  */
4029    if (!NEXTCHR_IS_EOS && nextchr != '\n')
4030     sayNO;
4031    break;
4032
4033   case EOL: /* /..$/  */
4034    /* FALL THROUGH */
4035   case SEOL: /* /..$/s  */
4036    if (!NEXTCHR_IS_EOS && nextchr != '\n')
4037     sayNO;
4038    if (reginfo->strend - locinput > 1)
4039     sayNO;
4040    break;
4041
4042   case EOS: /*  \z  */
4043    if (!NEXTCHR_IS_EOS)
4044     sayNO;
4045    break;
4046
4047   case SANY: /*  /./s  */
4048    if (NEXTCHR_IS_EOS)
4049     sayNO;
4050    goto increment_locinput;
4051
4052   case CANY: /*  \C  */
4053    if (NEXTCHR_IS_EOS)
4054     sayNO;
4055    locinput++;
4056    break;
4057
4058   case REG_ANY: /*  /./  */
4059    if ((NEXTCHR_IS_EOS) || nextchr == '\n')
4060     sayNO;
4061    goto increment_locinput;
4062
4063
4064 #undef  ST
4065 #define ST st->u.trie
4066   case TRIEC: /* (ab|cd) with known charclass */
4067    /* In this case the charclass data is available inline so
4068    we can fail fast without a lot of extra overhead.
4069    */
4070    if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
4071     DEBUG_EXECUTE_r(
4072      PerlIO_printf(Perl_debug_log,
4073        "%*s  %sfailed to match trie start class...%s\n",
4074        REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4075     );
4076     sayNO_SILENT;
4077     assert(0); /* NOTREACHED */
4078    }
4079    /* FALL THROUGH */
4080   case TRIE:  /* (ab|cd)  */
4081    /* the basic plan of execution of the trie is:
4082    * At the beginning, run though all the states, and
4083    * find the longest-matching word. Also remember the position
4084    * of the shortest matching word. For example, this pattern:
4085    *    1  2 3 4    5
4086    *    ab|a|x|abcd|abc
4087    * when matched against the string "abcde", will generate
4088    * accept states for all words except 3, with the longest
4089    * matching word being 4, and the shortest being 2 (with
4090    * the position being after char 1 of the string).
4091    *
4092    * Then for each matching word, in word order (i.e. 1,2,4,5),
4093    * we run the remainder of the pattern; on each try setting
4094    * the current position to the character following the word,
4095    * returning to try the next word on failure.
4096    *
4097    * We avoid having to build a list of words at runtime by
4098    * using a compile-time structure, wordinfo[].prev, which
4099    * gives, for each word, the previous accepting word (if any).
4100    * In the case above it would contain the mappings 1->2, 2->0,
4101    * 3->0, 4->5, 5->1.  We can use this table to generate, from
4102    * the longest word (4 above), a list of all words, by
4103    * following the list of prev pointers; this gives us the
4104    * unordered list 4,5,1,2. Then given the current word we have
4105    * just tried, we can go through the list and find the
4106    * next-biggest word to try (so if we just failed on word 2,
4107    * the next in the list is 4).
4108    *
4109    * Since at runtime we don't record the matching position in
4110    * the string for each word, we have to work that out for
4111    * each word we're about to process. The wordinfo table holds
4112    * the character length of each word; given that we recorded
4113    * at the start: the position of the shortest word and its
4114    * length in chars, we just need to move the pointer the
4115    * difference between the two char lengths. Depending on
4116    * Unicode status and folding, that's cheap or expensive.
4117    *
4118    * This algorithm is optimised for the case where are only a
4119    * small number of accept states, i.e. 0,1, or maybe 2.
4120    * With lots of accepts states, and having to try all of them,
4121    * it becomes quadratic on number of accept states to find all
4122    * the next words.
4123    */
4124
4125    {
4126     /* what type of TRIE am I? (utf8 makes this contextual) */
4127     DECL_TRIE_TYPE(scan);
4128
4129     /* what trie are we using right now */
4130     reg_trie_data * const trie
4131      = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
4132     HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
4133     U32 state = trie->startstate;
4134
4135     if (   trie->bitmap
4136      && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
4137     {
4138      if (trie->states[ state ].wordnum) {
4139       DEBUG_EXECUTE_r(
4140        PerlIO_printf(Perl_debug_log,
4141           "%*s  %smatched empty string...%s\n",
4142           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4143       );
4144       if (!trie->jump)
4145        break;
4146      } else {
4147       DEBUG_EXECUTE_r(
4148        PerlIO_printf(Perl_debug_log,
4149           "%*s  %sfailed to match trie start class...%s\n",
4150           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4151       );
4152       sayNO_SILENT;
4153     }
4154     }
4155
4156    {
4157     U8 *uc = ( U8* )locinput;
4158
4159     STRLEN len = 0;
4160     STRLEN foldlen = 0;
4161     U8 *uscan = (U8*)NULL;
4162     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
4163     U32 charcount = 0; /* how many input chars we have matched */
4164     U32 accepted = 0; /* have we seen any accepting states? */
4165
4166     ST.jump = trie->jump;
4167     ST.me = scan;
4168     ST.firstpos = NULL;
4169     ST.longfold = FALSE; /* char longer if folded => it's harder */
4170     ST.nextword = 0;
4171
4172     /* fully traverse the TRIE; note the position of the
4173     shortest accept state and the wordnum of the longest
4174     accept state */
4175
4176     while ( state && uc <= (U8*)(reginfo->strend) ) {
4177      U32 base = trie->states[ state ].trans.base;
4178      UV uvc = 0;
4179      U16 charid = 0;
4180      U16 wordnum;
4181      wordnum = trie->states[ state ].wordnum;
4182
4183      if (wordnum) { /* it's an accept state */
4184       if (!accepted) {
4185        accepted = 1;
4186        /* record first match position */
4187        if (ST.longfold) {
4188         ST.firstpos = (U8*)locinput;
4189         ST.firstchars = 0;
4190        }
4191        else {
4192         ST.firstpos = uc;
4193         ST.firstchars = charcount;
4194        }
4195       }
4196       if (!ST.nextword || wordnum < ST.nextword)
4197        ST.nextword = wordnum;
4198       ST.topword = wordnum;
4199      }
4200
4201      DEBUG_TRIE_EXECUTE_r({
4202         DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
4203         PerlIO_printf( Perl_debug_log,
4204          "%*s  %sState: %4"UVxf" Accepted: %c ",
4205          2+depth * 2, "", PL_colors[4],
4206          (UV)state, (accepted ? 'Y' : 'N'));
4207      });
4208
4209      /* read a char and goto next state */
4210      if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
4211       I32 offset;
4212       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
4213            uscan, len, uvc, charid, foldlen,
4214            foldbuf, uniflags);
4215       charcount++;
4216       if (foldlen>0)
4217        ST.longfold = TRUE;
4218       if (charid &&
4219        ( ((offset =
4220        base + charid - 1 - trie->uniquecharcount)) >= 0)
4221
4222        && ((U32)offset < trie->lasttrans)
4223        && trie->trans[offset].check == state)
4224       {
4225        state = trie->trans[offset].next;
4226       }
4227       else {
4228        state = 0;
4229       }
4230       uc += len;
4231
4232      }
4233      else {
4234       state = 0;
4235      }
4236      DEBUG_TRIE_EXECUTE_r(
4237       PerlIO_printf( Perl_debug_log,
4238        "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
4239        charid, uvc, (UV)state, PL_colors[5] );
4240      );
4241     }
4242     if (!accepted)
4243     sayNO;
4244
4245     /* calculate total number of accept states */
4246     {
4247      U16 w = ST.topword;
4248      accepted = 0;
4249      while (w) {
4250       w = trie->wordinfo[w].prev;
4251       accepted++;
4252      }
4253      ST.accepted = accepted;
4254     }
4255
4256     DEBUG_EXECUTE_r(
4257      PerlIO_printf( Perl_debug_log,
4258       "%*s  %sgot %"IVdf" possible matches%s\n",
4259       REPORT_CODE_OFF + depth * 2, "",
4260       PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4261     );
4262     goto trie_first_try; /* jump into the fail handler */
4263    }}
4264    assert(0); /* NOTREACHED */
4265
4266   case TRIE_next_fail: /* we failed - try next alternative */
4267   {
4268    U8 *uc;
4269    if ( ST.jump) {
4270     REGCP_UNWIND(ST.cp);
4271     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
4272    }
4273    if (!--ST.accepted) {
4274     DEBUG_EXECUTE_r({
4275      PerlIO_printf( Perl_debug_log,
4276       "%*s  %sTRIE failed...%s\n",
4277       REPORT_CODE_OFF+depth*2, "",
4278       PL_colors[4],
4279       PL_colors[5] );
4280     });
4281     sayNO_SILENT;
4282    }
4283    {
4284     /* Find next-highest word to process.  Note that this code
4285     * is O(N^2) per trie run (O(N) per branch), so keep tight */
4286     U16 min = 0;
4287     U16 word;
4288     U16 const nextword = ST.nextword;
4289     reg_trie_wordinfo * const wordinfo
4290      = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4291     for (word=ST.topword; word; word=wordinfo[word].prev) {
4292      if (word > nextword && (!min || word < min))
4293       min = word;
4294     }
4295     ST.nextword = min;
4296    }
4297
4298   trie_first_try:
4299    if (do_cutgroup) {
4300     do_cutgroup = 0;
4301     no_final = 0;
4302    }
4303
4304    if ( ST.jump) {
4305     ST.lastparen = rex->lastparen;
4306     ST.lastcloseparen = rex->lastcloseparen;
4307     REGCP_SET(ST.cp);
4308    }
4309
4310    /* find start char of end of current word */
4311    {
4312     U32 chars; /* how many chars to skip */
4313     reg_trie_data * const trie
4314      = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4315
4316     assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4317        >=  ST.firstchars);
4318     chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4319        - ST.firstchars;
4320     uc = ST.firstpos;
4321
4322     if (ST.longfold) {
4323      /* the hard option - fold each char in turn and find
4324      * its folded length (which may be different */
4325      U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4326      STRLEN foldlen;
4327      STRLEN len;
4328      UV uvc;
4329      U8 *uscan;
4330
4331      while (chars) {
4332       if (utf8_target) {
4333        uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
4334              uniflags);
4335        uc += len;
4336       }
4337       else {
4338        uvc = *uc;
4339        uc++;
4340       }
4341       uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4342       uscan = foldbuf;
4343       while (foldlen) {
4344        if (!--chars)
4345         break;
4346        uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
4347            uniflags);
4348        uscan += len;
4349        foldlen -= len;
4350       }
4351      }
4352     }
4353     else {
4354      if (utf8_target)
4355       while (chars--)
4356        uc += UTF8SKIP(uc);
4357      else
4358       uc += chars;
4359     }
4360    }
4361
4362    scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4363        ? ST.jump[ST.nextword]
4364        : NEXT_OFF(ST.me));
4365
4366    DEBUG_EXECUTE_r({
4367     PerlIO_printf( Perl_debug_log,
4368      "%*s  %sTRIE matched word #%d, continuing%s\n",
4369      REPORT_CODE_OFF+depth*2, "",
4370      PL_colors[4],
4371      ST.nextword,
4372      PL_colors[5]
4373      );
4374    });
4375
4376    if (ST.accepted > 1 || has_cutgroup) {
4377     PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4378     assert(0); /* NOTREACHED */
4379    }
4380    /* only one choice left - just continue */
4381    DEBUG_EXECUTE_r({
4382     AV *const trie_words
4383      = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4384     SV ** const tmp = av_fetch( trie_words,
4385      ST.nextword-1, 0 );
4386     SV *sv= tmp ? sv_newmortal() : NULL;
4387
4388     PerlIO_printf( Perl_debug_log,
4389      "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
4390      REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4391      ST.nextword,
4392      tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4393        PL_colors[0], PL_colors[1],
4394        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4395       )
4396      : "not compiled under -Dr",
4397      PL_colors[5] );
4398    });
4399
4400    locinput = (char*)uc;
4401    continue; /* execute rest of RE */
4402    assert(0); /* NOTREACHED */
4403   }
4404 #undef  ST
4405
4406   case EXACT: {            /*  /abc/        */
4407    char *s = STRING(scan);
4408    ln = STR_LEN(scan);
4409    if (utf8_target != is_utf8_pat) {
4410     /* The target and the pattern have differing utf8ness. */
4411     char *l = locinput;
4412     const char * const e = s + ln;
4413
4414     if (utf8_target) {
4415      /* The target is utf8, the pattern is not utf8.
4416      * Above-Latin1 code points can't match the pattern;
4417      * invariants match exactly, and the other Latin1 ones need
4418      * to be downgraded to a single byte in order to do the
4419      * comparison.  (If we could be confident that the target
4420      * is not malformed, this could be refactored to have fewer
4421      * tests by just assuming that if the first bytes match, it
4422      * is an invariant, but there are tests in the test suite
4423      * dealing with (??{...}) which violate this) */
4424      while (s < e) {
4425       if (l >= reginfo->strend
4426        || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4427       {
4428        sayNO;
4429       }
4430       if (UTF8_IS_INVARIANT(*(U8*)l)) {
4431        if (*l != *s) {
4432         sayNO;
4433        }
4434        l++;
4435       }
4436       else {
4437        if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4438        {
4439         sayNO;
4440        }
4441        l += 2;
4442       }
4443       s++;
4444      }
4445     }
4446     else {
4447      /* The target is not utf8, the pattern is utf8. */
4448      while (s < e) {
4449       if (l >= reginfo->strend
4450        || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4451       {
4452        sayNO;
4453       }
4454       if (UTF8_IS_INVARIANT(*(U8*)s)) {
4455        if (*s != *l) {
4456         sayNO;
4457        }
4458        s++;
4459       }
4460       else {
4461        if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4462        {
4463         sayNO;
4464        }
4465        s += 2;
4466       }
4467       l++;
4468      }
4469     }
4470     locinput = l;
4471    }
4472    else {
4473     /* The target and the pattern have the same utf8ness. */
4474     /* Inline the first character, for speed. */
4475     if (reginfo->strend - locinput < ln
4476      || UCHARAT(s) != nextchr
4477      || (ln > 1 && memNE(s, locinput, ln)))
4478     {
4479      sayNO;
4480     }
4481     locinput += ln;
4482    }
4483    break;
4484    }
4485
4486   case EXACTFL: {          /*  /abc/il      */
4487    re_fold_t folder;
4488    const U8 * fold_array;
4489    const char * s;
4490    U32 fold_utf8_flags;
4491
4492    folder = foldEQ_locale;
4493    fold_array = PL_fold_locale;
4494    fold_utf8_flags = FOLDEQ_LOCALE;
4495    goto do_exactf;
4496
4497   case EXACTFU_SS:         /*  /\x{df}/iu   */
4498   case EXACTFU:            /*  /abc/iu      */
4499    folder = foldEQ_latin1;
4500    fold_array = PL_fold_latin1;
4501    fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4502    goto do_exactf;
4503
4504   case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
4505         patterns */
4506    assert(! is_utf8_pat);
4507    /* FALL THROUGH */
4508   case EXACTFA:            /*  /abc/iaa     */
4509    folder = foldEQ_latin1;
4510    fold_array = PL_fold_latin1;
4511    fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4512    goto do_exactf;
4513
4514   case EXACTF:             /*  /abc/i    This node only generated for
4515            non-utf8 patterns */
4516    assert(! is_utf8_pat);
4517    folder = foldEQ;
4518    fold_array = PL_fold;
4519    fold_utf8_flags = 0;
4520
4521   do_exactf:
4522    s = STRING(scan);
4523    ln = STR_LEN(scan);
4524
4525    if (utf8_target
4526     || is_utf8_pat
4527     || state_num == EXACTFU_SS
4528     || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4529    {
4530    /* Either target or the pattern are utf8, or has the issue where
4531    * the fold lengths may differ. */
4532     const char * const l = locinput;
4533     char *e = reginfo->strend;
4534
4535     if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
4536           l, &e, 0,  utf8_target, fold_utf8_flags))
4537     {
4538      sayNO;
4539     }
4540     locinput = e;
4541     break;
4542    }
4543
4544    /* Neither the target nor the pattern are utf8 */
4545    if (UCHARAT(s) != nextchr
4546     && !NEXTCHR_IS_EOS
4547     && UCHARAT(s) != fold_array[nextchr])
4548    {
4549     sayNO;
4550    }
4551    if (reginfo->strend - locinput < ln)
4552     sayNO;
4553    if (ln > 1 && ! folder(s, locinput, ln))
4554     sayNO;
4555    locinput += ln;
4556    break;
4557   }
4558
4559   /* XXX Could improve efficiency by separating these all out using a
4560   * macro or in-line function.  At that point regcomp.c would no longer
4561   * have to set the FLAGS fields of these */
4562   case BOUNDL:  /*  /\b/l  */
4563   case NBOUNDL: /*  /\B/l  */
4564   case BOUND:   /*  /\b/   */
4565   case BOUNDU:  /*  /\b/u  */
4566   case BOUNDA:  /*  /\b/a  */
4567   case NBOUND:  /*  /\B/   */
4568   case NBOUNDU: /*  /\B/u  */
4569   case NBOUNDA: /*  /\B/a  */
4570    /* was last char in word? */
4571    if (utf8_target
4572     && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4573     && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4574    {
4575     if (locinput == reginfo->strbeg)
4576      ln = '\n';
4577     else {
4578      const U8 * const r =
4579        reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4580
4581      ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
4582                 0, uniflags);
4583     }
4584     if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4585      ln = isWORDCHAR_uni(ln);
4586      if (NEXTCHR_IS_EOS)
4587       n = 0;
4588      else {
4589       LOAD_UTF8_CHARCLASS_ALNUM();
4590       n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4591                 utf8_target);
4592      }
4593     }
4594     else {
4595      ln = isWORDCHAR_LC_uvchr(ln);
4596      n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4597     }
4598    }
4599    else {
4600
4601     /* Here the string isn't utf8, or is utf8 and only ascii
4602     * characters are to match \w.  In the latter case looking at
4603     * the byte just prior to the current one may be just the final
4604     * byte of a multi-byte character.  This is ok.  There are two
4605     * cases:
4606     * 1) it is a single byte character, and then the test is doing
4607     * just what it's supposed to.
4608     * 2) it is a multi-byte character, in which case the final
4609     * byte is never mistakable for ASCII, and so the test
4610     * will say it is not a word character, which is the
4611     * correct answer. */
4612     ln = (locinput != reginfo->strbeg) ?
4613      UCHARAT(locinput - 1) : '\n';
4614     switch (FLAGS(scan)) {
4615      case REGEX_UNICODE_CHARSET:
4616       ln = isWORDCHAR_L1(ln);
4617       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4618       break;
4619      case REGEX_LOCALE_CHARSET:
4620       ln = isWORDCHAR_LC(ln);
4621       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4622       break;
4623      case REGEX_DEPENDS_CHARSET:
4624       ln = isWORDCHAR(ln);
4625       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4626       break;
4627      case REGEX_ASCII_RESTRICTED_CHARSET:
4628      case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4629       ln = isWORDCHAR_A(ln);
4630       n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4631       break;
4632      default:
4633       Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4634       break;
4635     }
4636    }
4637    /* Note requires that all BOUNDs be lower than all NBOUNDs in
4638    * regcomp.sym */
4639    if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4640      sayNO;
4641    break;
4642
4643   case ANYOF:  /*  /[abc]/       */
4644    if (NEXTCHR_IS_EOS)
4645     sayNO;
4646    if (utf8_target) {
4647     if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
4648                 utf8_target))
4649      sayNO;
4650     locinput += UTF8SKIP(locinput);
4651    }
4652    else {
4653     if (!REGINCLASS(rex, scan, (U8*)locinput))
4654      sayNO;
4655     locinput++;
4656    }
4657    break;
4658
4659   /* The argument (FLAGS) to all the POSIX node types is the class number
4660   * */
4661
4662   case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
4663    to_complement = 1;
4664    /* FALLTHROUGH */
4665
4666   case POSIXL:    /* \w or [:punct:] etc. under /l */
4667    if (NEXTCHR_IS_EOS)
4668     sayNO;
4669
4670    /* Use isFOO_lc() for characters within Latin1.  (Note that
4671    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4672    * wouldn't be invariant) */
4673    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4674     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4675      sayNO;
4676     }
4677    }
4678    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4679     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4680           (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4681                *(locinput + 1))))))
4682     {
4683      sayNO;
4684     }
4685    }
4686    else { /* Here, must be an above Latin-1 code point */
4687     goto utf8_posix_not_eos;
4688    }
4689
4690    /* Here, must be utf8 */
4691    locinput += UTF8SKIP(locinput);
4692    break;
4693
4694   case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
4695    to_complement = 1;
4696    /* FALLTHROUGH */
4697
4698   case POSIXD:    /* \w or [:punct:] etc. under /d */
4699    if (utf8_target) {
4700     goto utf8_posix;
4701    }
4702    goto posixa;
4703
4704   case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
4705
4706    if (NEXTCHR_IS_EOS) {
4707     sayNO;
4708    }
4709
4710    /* All UTF-8 variants match */
4711    if (! UTF8_IS_INVARIANT(nextchr)) {
4712     goto increment_locinput;
4713    }
4714
4715    to_complement = 1;
4716    /* FALLTHROUGH */
4717
4718   case POSIXA:    /* \w or [:punct:] etc. under /a */
4719
4720   posixa:
4721    /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4722    * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4723    * character is a single byte */
4724
4725    if (NEXTCHR_IS_EOS
4726     || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4727                FLAGS(scan)))))
4728    {
4729     sayNO;
4730    }
4731
4732    /* Here we are either not in utf8, or we matched a utf8-invariant,
4733    * so the next char is the next byte */
4734    locinput++;
4735    break;
4736
4737   case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
4738    to_complement = 1;
4739    /* FALLTHROUGH */
4740
4741   case POSIXU:    /* \w or [:punct:] etc. under /u */
4742   utf8_posix:
4743    if (NEXTCHR_IS_EOS) {
4744     sayNO;
4745    }
4746   utf8_posix_not_eos:
4747
4748    /* Use _generic_isCC() for characters within Latin1.  (Note that
4749    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4750    * wouldn't be invariant) */
4751    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4752     if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4753               FLAGS(scan)))))
4754     {
4755      sayNO;
4756     }
4757     locinput++;
4758    }
4759    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4760     if (! (to_complement
4761      ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4762                *(locinput + 1)),
4763            FLAGS(scan)))))
4764     {
4765      sayNO;
4766     }
4767     locinput += 2;
4768    }
4769    else {  /* Handle above Latin-1 code points */
4770     classnum = (_char_class_number) FLAGS(scan);
4771     if (classnum < _FIRST_NON_SWASH_CC) {
4772
4773      /* Here, uses a swash to find such code points.  Load if if
4774      * not done already */
4775      if (! PL_utf8_swash_ptrs[classnum]) {
4776       U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4777       PL_utf8_swash_ptrs[classnum]
4778         = _core_swash_init("utf8",
4779           "",
4780           &PL_sv_undef, 1, 0,
4781           PL_XPosix_ptrs[classnum], &flags);
4782      }
4783      if (! (to_complement
4784       ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4785            (U8 *) locinput, TRUE))))
4786      {
4787       sayNO;
4788      }
4789     }
4790     else {  /* Here, uses macros to find above Latin-1 code points */
4791      switch (classnum) {
4792       case _CC_ENUM_SPACE:    /* XXX would require separate
4793             code if we revert the change
4794             of \v matching this */
4795       case _CC_ENUM_PSXSPC:
4796        if (! (to_complement
4797           ^ cBOOL(is_XPERLSPACE_high(locinput))))
4798        {
4799         sayNO;
4800        }
4801        break;
4802       case _CC_ENUM_BLANK:
4803        if (! (to_complement
4804            ^ cBOOL(is_HORIZWS_high(locinput))))
4805        {
4806         sayNO;
4807        }
4808        break;
4809       case _CC_ENUM_XDIGIT:
4810        if (! (to_complement
4811            ^ cBOOL(is_XDIGIT_high(locinput))))
4812        {
4813         sayNO;
4814        }
4815        break;
4816       case _CC_ENUM_VERTSPACE:
4817        if (! (to_complement
4818            ^ cBOOL(is_VERTWS_high(locinput))))
4819        {
4820         sayNO;
4821        }
4822        break;
4823       default:    /* The rest, e.g. [:cntrl:], can't match
4824          above Latin1 */
4825        if (! to_complement) {
4826         sayNO;
4827        }
4828        break;
4829      }
4830     }
4831     locinput += UTF8SKIP(locinput);
4832    }
4833    break;
4834
4835   case CLUMP: /* Match \X: logical Unicode character.  This is defined as
4836      a Unicode extended Grapheme Cluster */
4837    /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
4838    extended Grapheme Cluster is:
4839
4840    CR LF
4841    | Prepend* Begin Extend*
4842    | .
4843
4844    Begin is:           ( Special_Begin | ! Control )
4845    Special_Begin is:   ( Regional-Indicator+ | Hangul-syllable )
4846    Extend is:          ( Grapheme_Extend | Spacing_Mark )
4847    Control is:         [ GCB_Control | CR | LF ]
4848    Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4849
4850    If we create a 'Regular_Begin' = Begin - Special_Begin, then
4851    we can rewrite
4852
4853     Begin is ( Regular_Begin + Special Begin )
4854
4855    It turns out that 98.4% of all Unicode code points match
4856    Regular_Begin.  Doing it this way eliminates a table match in
4857    the previous implementation for almost all Unicode code points.
4858
4859    There is a subtlety with Prepend* which showed up in testing.
4860    Note that the Begin, and only the Begin is required in:
4861     | Prepend* Begin Extend*
4862    Also, Begin contains '! Control'.  A Prepend must be a
4863    '!  Control', which means it must also be a Begin.  What it
4864    comes down to is that if we match Prepend* and then find no
4865    suitable Begin afterwards, that if we backtrack the last
4866    Prepend, that one will be a suitable Begin.
4867    */
4868
4869    if (NEXTCHR_IS_EOS)
4870     sayNO;
4871    if  (! utf8_target) {
4872
4873     /* Match either CR LF  or '.', as all the other possibilities
4874     * require utf8 */
4875     locinput++;     /* Match the . or CR */
4876     if (nextchr == '\r' /* And if it was CR, and the next is LF,
4877          match the LF */
4878      && locinput < reginfo->strend
4879      && UCHARAT(locinput) == '\n')
4880     {
4881      locinput++;
4882     }
4883    }
4884    else {
4885
4886     /* Utf8: See if is ( CR LF ); already know that locinput <
4887     * reginfo->strend, so locinput+1 is in bounds */
4888     if ( nextchr == '\r' && locinput+1 < reginfo->strend
4889      && UCHARAT(locinput + 1) == '\n')
4890     {
4891      locinput += 2;
4892     }
4893     else {
4894      STRLEN len;
4895
4896      /* In case have to backtrack to beginning, then match '.' */
4897      char *starting = locinput;
4898
4899      /* In case have to backtrack the last prepend */
4900      char *previous_prepend = NULL;
4901
4902      LOAD_UTF8_CHARCLASS_GCB();
4903
4904      /* Match (prepend)*   */
4905      while (locinput < reginfo->strend
4906       && (len = is_GCB_Prepend_utf8(locinput)))
4907      {
4908       previous_prepend = locinput;
4909       locinput += len;
4910      }
4911
4912      /* As noted above, if we matched a prepend character, but
4913      * the next thing won't match, back off the last prepend we
4914      * matched, as it is guaranteed to match the begin */
4915      if (previous_prepend
4916       && (locinput >=  reginfo->strend
4917        || (! swash_fetch(PL_utf8_X_regular_begin,
4918            (U8*)locinput, utf8_target)
4919         && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4920       )
4921      {
4922       locinput = previous_prepend;
4923      }
4924
4925      /* Note that here we know reginfo->strend > locinput, as we
4926      * tested that upon input to this switch case, and if we
4927      * moved locinput forward, we tested the result just above
4928      * and it either passed, or we backed off so that it will
4929      * now pass */
4930      if (swash_fetch(PL_utf8_X_regular_begin,
4931          (U8*)locinput, utf8_target)) {
4932       locinput += UTF8SKIP(locinput);
4933      }
4934      else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4935
4936       /* Here did not match the required 'Begin' in the
4937       * second term.  So just match the very first
4938       * character, the '.' of the final term of the regex */
4939       locinput = starting + UTF8SKIP(starting);
4940       goto exit_utf8;
4941      } else {
4942
4943       /* Here is a special begin.  It can be composed of
4944       * several individual characters.  One possibility is
4945       * RI+ */
4946       if ((len = is_GCB_RI_utf8(locinput))) {
4947        locinput += len;
4948        while (locinput < reginfo->strend
4949         && (len = is_GCB_RI_utf8(locinput)))
4950        {
4951         locinput += len;
4952        }
4953       } else if ((len = is_GCB_T_utf8(locinput))) {
4954        /* Another possibility is T+ */
4955        locinput += len;
4956        while (locinput < reginfo->strend
4957         && (len = is_GCB_T_utf8(locinput)))
4958        {
4959         locinput += len;
4960        }
4961       } else {
4962
4963        /* Here, neither RI+ nor T+; must be some other
4964        * Hangul.  That means it is one of the others: L,
4965        * LV, LVT or V, and matches:
4966        * L* (L | LVT T* | V * V* T* | LV  V* T*) */
4967
4968        /* Match L*           */
4969        while (locinput < reginfo->strend
4970         && (len = is_GCB_L_utf8(locinput)))
4971        {
4972         locinput += len;
4973        }
4974
4975        /* Here, have exhausted L*.  If the next character
4976        * is not an LV, LVT nor V, it means we had to have
4977        * at least one L, so matches L+ in the original
4978        * equation, we have a complete hangul syllable.
4979        * Are done. */
4980
4981        if (locinput < reginfo->strend
4982         && is_GCB_LV_LVT_V_utf8(locinput))
4983        {
4984         /* Otherwise keep going.  Must be LV, LVT or V.
4985         * See if LVT, by first ruling out V, then LV */
4986         if (! is_GCB_V_utf8(locinput)
4987           /* All but every TCount one is LV */
4988          && (valid_utf8_to_uvchr((U8 *) locinput,
4989                   NULL)
4990                   - SBASE)
4991           % TCount != 0)
4992         {
4993          locinput += UTF8SKIP(locinput);
4994         } else {
4995
4996          /* Must be  V or LV.  Take it, then match
4997          * V*     */
4998          locinput += UTF8SKIP(locinput);
4999          while (locinput < reginfo->strend
5000           && (len = is_GCB_V_utf8(locinput)))
5001          {
5002           locinput += len;
5003          }
5004         }
5005
5006         /* And any of LV, LVT, or V can be followed
5007         * by T*            */
5008         while (locinput < reginfo->strend
5009          && (len = is_GCB_T_utf8(locinput)))
5010         {
5011          locinput += len;
5012         }
5013        }
5014       }
5015      }
5016
5017      /* Match any extender */
5018      while (locinput < reginfo->strend
5019        && swash_fetch(PL_utf8_X_extend,
5020            (U8*)locinput, utf8_target))
5021      {
5022       locinput += UTF8SKIP(locinput);
5023      }
5024     }
5025    exit_utf8:
5026     if (locinput > reginfo->strend) sayNO;
5027    }
5028    break;
5029
5030   case NREFFL:  /*  /\g{name}/il  */
5031   {   /* The capture buffer cases.  The ones beginning with N for the
5032    named buffers just convert to the equivalent numbered and
5033    pretend they were called as the corresponding numbered buffer
5034    op.  */
5035    /* don't initialize these in the declaration, it makes C++
5036    unhappy */
5037    const char *s;
5038    char type;
5039    re_fold_t folder;
5040    const U8 *fold_array;
5041    UV utf8_fold_flags;
5042
5043    folder = foldEQ_locale;
5044    fold_array = PL_fold_locale;
5045    type = REFFL;
5046    utf8_fold_flags = FOLDEQ_LOCALE;
5047    goto do_nref;
5048
5049   case NREFFA:  /*  /\g{name}/iaa  */
5050    folder = foldEQ_latin1;
5051    fold_array = PL_fold_latin1;
5052    type = REFFA;
5053    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5054    goto do_nref;
5055
5056   case NREFFU:  /*  /\g{name}/iu  */
5057    folder = foldEQ_latin1;
5058    fold_array = PL_fold_latin1;
5059    type = REFFU;
5060    utf8_fold_flags = 0;
5061    goto do_nref;
5062
5063   case NREFF:  /*  /\g{name}/i  */
5064    folder = foldEQ;
5065    fold_array = PL_fold;
5066    type = REFF;
5067    utf8_fold_flags = 0;
5068    goto do_nref;
5069
5070   case NREF:  /*  /\g{name}/   */
5071    type = REF;
5072    folder = NULL;
5073    fold_array = NULL;
5074    utf8_fold_flags = 0;
5075   do_nref:
5076
5077    /* For the named back references, find the corresponding buffer
5078    * number */
5079    n = reg_check_named_buff_matched(rex,scan);
5080
5081    if ( ! n ) {
5082     sayNO;
5083    }
5084    goto do_nref_ref_common;
5085
5086   case REFFL:  /*  /\1/il  */
5087    folder = foldEQ_locale;
5088    fold_array = PL_fold_locale;
5089    utf8_fold_flags = FOLDEQ_LOCALE;
5090    goto do_ref;
5091
5092   case REFFA:  /*  /\1/iaa  */
5093    folder = foldEQ_latin1;
5094    fold_array = PL_fold_latin1;
5095    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5096    goto do_ref;
5097
5098   case REFFU:  /*  /\1/iu  */
5099    folder = foldEQ_latin1;
5100    fold_array = PL_fold_latin1;
5101    utf8_fold_flags = 0;
5102    goto do_ref;
5103
5104   case REFF:  /*  /\1/i  */
5105    folder = foldEQ;
5106    fold_array = PL_fold;
5107    utf8_fold_flags = 0;
5108    goto do_ref;
5109
5110   case REF:  /*  /\1/    */
5111    folder = NULL;
5112    fold_array = NULL;
5113    utf8_fold_flags = 0;
5114
5115   do_ref:
5116    type = OP(scan);
5117    n = ARG(scan);  /* which paren pair */
5118
5119   do_nref_ref_common:
5120    ln = rex->offs[n].start;
5121    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5122    if (rex->lastparen < n || ln == -1)
5123     sayNO;   /* Do not match unless seen CLOSEn. */
5124    if (ln == rex->offs[n].end)
5125     break;
5126
5127    s = reginfo->strbeg + ln;
5128    if (type != REF /* REF can do byte comparison */
5129     && (utf8_target || type == REFFU || type == REFFL))
5130    {
5131     char * limit = reginfo->strend;
5132
5133     /* This call case insensitively compares the entire buffer
5134      * at s, with the current input starting at locinput, but
5135      * not going off the end given by reginfo->strend, and
5136      * returns in <limit> upon success, how much of the
5137      * current input was matched */
5138     if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
5139          locinput, &limit, 0, utf8_target, utf8_fold_flags))
5140     {
5141      sayNO;
5142     }
5143     locinput = limit;
5144     break;
5145    }
5146
5147    /* Not utf8:  Inline the first character, for speed. */
5148    if (!NEXTCHR_IS_EOS &&
5149     UCHARAT(s) != nextchr &&
5150     (type == REF ||
5151     UCHARAT(s) != fold_array[nextchr]))
5152     sayNO;
5153    ln = rex->offs[n].end - ln;
5154    if (locinput + ln > reginfo->strend)
5155     sayNO;
5156    if (ln > 1 && (type == REF
5157       ? memNE(s, locinput, ln)
5158       : ! folder(s, locinput, ln)))
5159     sayNO;
5160    locinput += ln;
5161    break;
5162   }
5163
5164   case NOTHING: /* null op; e.g. the 'nothing' following
5165      * the '*' in m{(a+|b)*}' */
5166    break;
5167   case TAIL: /* placeholder while compiling (A|B|C) */
5168    break;
5169
5170   case BACK: /* ??? doesn't appear to be used ??? */
5171    break;
5172
5173 #undef  ST
5174 #define ST st->u.eval
5175   {
5176    SV *ret;
5177    REGEXP *re_sv;
5178    regexp *re;
5179    regexp_internal *rei;
5180    regnode *startpoint;
5181
5182   case GOSTART: /*  (?R)  */
5183   case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
5184    if (cur_eval && cur_eval->locinput==locinput) {
5185     if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
5186      Perl_croak(aTHX_ "Infinite recursion in regex");
5187     if ( ++nochange_depth > max_nochange_depth )
5188      Perl_croak(aTHX_
5189       "Pattern subroutine nesting without pos change"
5190       " exceeded limit in regex");
5191    } else {
5192     nochange_depth = 0;
5193    }
5194    re_sv = rex_sv;
5195    re = rex;
5196    rei = rexi;
5197    if (OP(scan)==GOSUB) {
5198     startpoint = scan + ARG2L(scan);
5199     ST.close_paren = ARG(scan);
5200    } else {
5201     startpoint = rei->program+1;
5202     ST.close_paren = 0;
5203    }
5204
5205    /* Save all the positions seen so far. */
5206    ST.cp = regcppush(rex, 0, maxopenparen);
5207    REGCP_SET(ST.lastcp);
5208
5209    /* and then jump to the code we share with EVAL */
5210    goto eval_recurse_doit;
5211
5212    assert(0); /* NOTREACHED */
5213
5214   case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
5215    if (cur_eval && cur_eval->locinput==locinput) {
5216     if ( ++nochange_depth > max_nochange_depth )
5217      Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
5218    } else {
5219     nochange_depth = 0;
5220    }
5221    {
5222     /* execute the code in the {...} */
5223
5224     dSP;
5225     IV before;
5226     OP * const oop = PL_op;
5227     COP * const ocurcop = PL_curcop;
5228     OP *nop;
5229     CV *newcv;
5230
5231     /* save *all* paren positions */
5232     regcppush(rex, 0, maxopenparen);
5233     REGCP_SET(runops_cp);
5234
5235     if (!caller_cv)
5236      caller_cv = find_runcv(NULL);
5237
5238     n = ARG(scan);
5239
5240     if (rexi->data->what[n] == 'r') { /* code from an external qr */
5241      newcv = (ReANY(
5242             (REGEXP*)(rexi->data->data[n])
5243            ))->qr_anoncv
5244           ;
5245      nop = (OP*)rexi->data->data[n+1];
5246     }
5247     else if (rexi->data->what[n] == 'l') { /* literal code */
5248      newcv = caller_cv;
5249      nop = (OP*)rexi->data->data[n];
5250      assert(CvDEPTH(newcv));
5251     }
5252     else {
5253      /* literal with own CV */
5254      assert(rexi->data->what[n] == 'L');
5255      newcv = rex->qr_anoncv;
5256      nop = (OP*)rexi->data->data[n];
5257     }
5258
5259     /* normally if we're about to execute code from the same
5260     * CV that we used previously, we just use the existing
5261     * CX stack entry. However, its possible that in the
5262     * meantime we may have backtracked, popped from the save
5263     * stack, and undone the SAVECOMPPAD(s) associated with
5264     * PUSH_MULTICALL; in which case PL_comppad no longer
5265     * points to newcv's pad. */
5266     if (newcv != last_pushed_cv || PL_comppad != last_pad)
5267     {
5268      U8 flags = (CXp_SUB_RE |
5269         ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
5270      if (last_pushed_cv) {
5271       CHANGE_MULTICALL_FLAGS(newcv, flags);
5272      }
5273      else {
5274       PUSH_MULTICALL_FLAGS(newcv, flags);
5275      }
5276      last_pushed_cv = newcv;
5277     }
5278     else {
5279      /* these assignments are just to silence compiler
5280      * warnings */
5281      multicall_cop = NULL;
5282      newsp = NULL;
5283     }
5284     last_pad = PL_comppad;
5285
5286     /* the initial nextstate you would normally execute
5287     * at the start of an eval (which would cause error
5288     * messages to come from the eval), may be optimised
5289     * away from the execution path in the regex code blocks;
5290     * so manually set PL_curcop to it initially */
5291     {
5292      OP *o = cUNOPx(nop)->op_first;
5293      assert(o->op_type == OP_NULL);
5294      if (o->op_targ == OP_SCOPE) {
5295       o = cUNOPo->op_first;
5296      }
5297      else {
5298       assert(o->op_targ == OP_LEAVE);
5299       o = cUNOPo->op_first;
5300       assert(o->op_type == OP_ENTER);
5301       o = o->op_sibling;
5302      }
5303
5304      if (o->op_type != OP_STUB) {
5305       assert(    o->op_type == OP_NEXTSTATE
5306         || o->op_type == OP_DBSTATE
5307         || (o->op_type == OP_NULL
5308          &&  (  o->op_targ == OP_NEXTSTATE
5309           || o->op_targ == OP_DBSTATE
5310           )
5311          )
5312       );
5313       PL_curcop = (COP*)o;
5314      }
5315     }
5316     nop = nop->op_next;
5317
5318     DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
5319      "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5320
5321     rex->offs[0].end = locinput - reginfo->strbeg;
5322     if (reginfo->info_aux_eval->pos_magic)
5323      MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5324         reginfo->sv, reginfo->strbeg,
5325         locinput - reginfo->strbeg);
5326
5327     if (sv_yes_mark) {
5328      SV *sv_mrk = get_sv("REGMARK", 1);
5329      sv_setsv(sv_mrk, sv_yes_mark);
5330     }
5331
5332     /* we don't use MULTICALL here as we want to call the
5333     * first op of the block of interest, rather than the
5334     * first op of the sub */
5335     before = (IV)(SP-PL_stack_base);
5336     PL_op = nop;
5337     CALLRUNOPS(aTHX);   /* Scalar context. */
5338     SPAGAIN;
5339     if ((IV)(SP-PL_stack_base) == before)
5340      ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
5341     else {
5342      ret = POPs;
5343      PUTBACK;
5344     }
5345
5346     /* before restoring everything, evaluate the returned
5347     * value, so that 'uninit' warnings don't use the wrong
5348     * PL_op or pad. Also need to process any magic vars
5349     * (e.g. $1) *before* parentheses are restored */
5350
5351     PL_op = NULL;
5352
5353     re_sv = NULL;
5354     if (logical == 0)        /*   (?{})/   */
5355      sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5356     else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
5357      sw = cBOOL(SvTRUE(ret));
5358      logical = 0;
5359     }
5360     else {                   /*  /(??{})  */
5361      /*  if its overloaded, let the regex compiler handle
5362      *  it; otherwise extract regex, or stringify  */
5363      if (SvGMAGICAL(ret))
5364       ret = sv_mortalcopy(ret);
5365      if (!SvAMAGIC(ret)) {
5366       SV *sv = ret;
5367       if (SvROK(sv))
5368        sv = SvRV(sv);
5369       if (SvTYPE(sv) == SVt_REGEXP)
5370        re_sv = (REGEXP*) sv;
5371       else if (SvSMAGICAL(ret)) {
5372        MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
5373        if (mg)
5374         re_sv = (REGEXP *) mg->mg_obj;
5375       }
5376
5377       /* force any undef warnings here */
5378       if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
5379        ret = sv_mortalcopy(ret);
5380        (void) SvPV_force_nolen(ret);
5381       }
5382      }
5383
5384     }
5385
5386     /* *** Note that at this point we don't restore
5387     * PL_comppad, (or pop the CxSUB) on the assumption it may
5388     * be used again soon. This is safe as long as nothing
5389     * in the regexp code uses the pad ! */
5390     PL_op = oop;
5391     PL_curcop = ocurcop;
5392     S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5393     PL_curpm = PL_reg_curpm;
5394
5395     if (logical != 2)
5396      break;
5397    }
5398
5399     /* only /(??{})/  from now on */
5400     logical = 0;
5401     {
5402      /* extract RE object from returned value; compiling if
5403      * necessary */
5404
5405      if (re_sv) {
5406       re_sv = reg_temp_copy(NULL, re_sv);
5407      }
5408      else {
5409       U32 pm_flags = 0;
5410
5411       if (SvUTF8(ret) && IN_BYTES) {
5412        /* In use 'bytes': make a copy of the octet
5413        * sequence, but without the flag on */
5414        STRLEN len;
5415        const char *const p = SvPV(ret, len);
5416        ret = newSVpvn_flags(p, len, SVs_TEMP);
5417       }
5418       if (rex->intflags & PREGf_USE_RE_EVAL)
5419        pm_flags |= PMf_USE_RE_EVAL;
5420
5421       /* if we got here, it should be an engine which
5422       * supports compiling code blocks and stuff */
5423       assert(rex->engine && rex->engine->op_comp);
5424       assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5425       re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5426          rex->engine, NULL, NULL,
5427          /* copy /msix etc to inner pattern */
5428          scan->flags,
5429          pm_flags);
5430
5431       if (!(SvFLAGS(ret)
5432        & (SVs_TEMP | SVs_GMG | SVf_ROK))
5433       && (!SvPADTMP(ret) || SvREADONLY(ret))) {
5434        /* This isn't a first class regexp. Instead, it's
5435        caching a regexp onto an existing, Perl visible
5436        scalar.  */
5437        sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5438       }
5439      }
5440      SAVEFREESV(re_sv);
5441      re = ReANY(re_sv);
5442     }
5443     RXp_MATCH_COPIED_off(re);
5444     re->subbeg = rex->subbeg;
5445     re->sublen = rex->sublen;
5446     re->suboffset = rex->suboffset;
5447     re->subcoffset = rex->subcoffset;
5448     re->lastparen = 0;
5449     re->lastcloseparen = 0;
5450     rei = RXi_GET(re);
5451     DEBUG_EXECUTE_r(
5452      debug_start_match(re_sv, utf8_target, locinput,
5453          reginfo->strend, "Matching embedded");
5454     );
5455     startpoint = rei->program + 1;
5456      ST.close_paren = 0; /* only used for GOSUB */
5457     /* Save all the seen positions so far. */
5458     ST.cp = regcppush(rex, 0, maxopenparen);
5459     REGCP_SET(ST.lastcp);
5460     /* and set maxopenparen to 0, since we are starting a "fresh" match */
5461     maxopenparen = 0;
5462     /* run the pattern returned from (??{...}) */
5463
5464   eval_recurse_doit: /* Share code with GOSUB below this line
5465        * At this point we expect the stack context to be
5466        * set up correctly */
5467
5468     /* invalidate the S-L poscache. We're now executing a
5469     * different set of WHILEM ops (and their associated
5470     * indexes) against the same string, so the bits in the
5471     * cache are meaningless. Setting maxiter to zero forces
5472     * the cache to be invalidated and zeroed before reuse.
5473     * XXX This is too dramatic a measure. Ideally we should
5474     * save the old cache and restore when running the outer
5475     * pattern again */
5476     reginfo->poscache_maxiter = 0;
5477
5478     /* the new regexp might have a different is_utf8_pat than we do */
5479     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5480
5481     ST.prev_rex = rex_sv;
5482     ST.prev_curlyx = cur_curlyx;
5483     rex_sv = re_sv;
5484     SET_reg_curpm(rex_sv);
5485     rex = re;
5486     rexi = rei;
5487     cur_curlyx = NULL;
5488     ST.B = next;
5489     ST.prev_eval = cur_eval;
5490     cur_eval = st;
5491     /* now continue from first node in postoned RE */
5492     PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5493     assert(0); /* NOTREACHED */
5494   }
5495
5496   case EVAL_AB: /* cleanup after a successful (??{A})B */
5497    /* note: this is called twice; first after popping B, then A */
5498    rex_sv = ST.prev_rex;
5499    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5500    SET_reg_curpm(rex_sv);
5501    rex = ReANY(rex_sv);
5502    rexi = RXi_GET(rex);
5503    {
5504     /* preserve $^R across LEAVE's. See Bug 121070. */
5505     SV *save_sv= GvSV(PL_replgv);
5506     SvREFCNT_inc(save_sv);
5507     regcpblow(ST.cp); /* LEAVE in disguise */
5508     sv_setsv(GvSV(PL_replgv), save_sv);
5509     SvREFCNT_dec(save_sv);
5510    }
5511    cur_eval = ST.prev_eval;
5512    cur_curlyx = ST.prev_curlyx;
5513
5514    /* Invalidate cache. See "invalidate" comment above. */
5515    reginfo->poscache_maxiter = 0;
5516    if ( nochange_depth )
5517     nochange_depth--;
5518    sayYES;
5519
5520
5521   case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5522    /* note: this is called twice; first after popping B, then A */
5523    rex_sv = ST.prev_rex;
5524    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5525    SET_reg_curpm(rex_sv);
5526    rex = ReANY(rex_sv);
5527    rexi = RXi_GET(rex);
5528
5529    REGCP_UNWIND(ST.lastcp);
5530    regcppop(rex, &maxopenparen);
5531    cur_eval = ST.prev_eval;
5532    cur_curlyx = ST.prev_curlyx;
5533    /* Invalidate cache. See "invalidate" comment above. */
5534    reginfo->poscache_maxiter = 0;
5535    if ( nochange_depth )
5536     nochange_depth--;
5537    sayNO_SILENT;
5538 #undef ST
5539
5540   case OPEN: /*  (  */
5541    n = ARG(scan);  /* which paren pair */
5542    rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5543    if (n > maxopenparen)
5544     maxopenparen = n;
5545    DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5546     "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5547     PTR2UV(rex),
5548     PTR2UV(rex->offs),
5549     (UV)n,
5550     (IV)rex->offs[n].start_tmp,
5551     (UV)maxopenparen
5552    ));
5553    lastopen = n;
5554    break;
5555
5556 /* XXX really need to log other places start/end are set too */
5557 #define CLOSE_CAPTURE \
5558  rex->offs[n].start = rex->offs[n].start_tmp; \
5559  rex->offs[n].end = locinput - reginfo->strbeg; \
5560  DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5561   "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5562   PTR2UV(rex), \
5563   PTR2UV(rex->offs), \
5564   (UV)n, \
5565   (IV)rex->offs[n].start, \
5566   (IV)rex->offs[n].end \
5567  ))
5568
5569   case CLOSE:  /*  )  */
5570    n = ARG(scan);  /* which paren pair */
5571    CLOSE_CAPTURE;
5572    if (n > rex->lastparen)
5573     rex->lastparen = n;
5574    rex->lastcloseparen = n;
5575    if (cur_eval && cur_eval->u.eval.close_paren == n) {
5576     goto fake_end;
5577    }
5578    break;
5579
5580   case ACCEPT:  /*  (*ACCEPT)  */
5581    if (ARG(scan)){
5582     regnode *cursor;
5583     for (cursor=scan;
5584      cursor && OP(cursor)!=END;
5585      cursor=regnext(cursor))
5586     {
5587      if ( OP(cursor)==CLOSE ){
5588       n = ARG(cursor);
5589       if ( n <= lastopen ) {
5590        CLOSE_CAPTURE;
5591        if (n > rex->lastparen)
5592         rex->lastparen = n;
5593        rex->lastcloseparen = n;
5594        if ( n == ARG(scan) || (cur_eval &&
5595         cur_eval->u.eval.close_paren == n))
5596         break;
5597       }
5598      }
5599     }
5600    }
5601    goto fake_end;
5602    /*NOTREACHED*/
5603
5604   case GROUPP:  /*  (?(1))  */
5605    n = ARG(scan);  /* which paren pair */
5606    sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5607    break;
5608
5609   case NGROUPP:  /*  (?(<name>))  */
5610    /* reg_check_named_buff_matched returns 0 for no match */
5611    sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5612    break;
5613
5614   case INSUBP:   /*  (?(R))  */
5615    n = ARG(scan);
5616    sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5617    break;
5618
5619   case DEFINEP:  /*  (?(DEFINE))  */
5620    sw = 0;
5621    break;
5622
5623   case IFTHEN:   /*  (?(cond)A|B)  */
5624    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5625    if (sw)
5626     next = NEXTOPER(NEXTOPER(scan));
5627    else {
5628     next = scan + ARG(scan);
5629     if (OP(next) == IFTHEN) /* Fake one. */
5630      next = NEXTOPER(NEXTOPER(next));
5631    }
5632    break;
5633
5634   case LOGICAL:  /* modifier for EVAL and IFMATCH */
5635    logical = scan->flags;
5636    break;
5637
5638 /*******************************************************************
5639
5640 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5641 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5642 STAR/PLUS/CURLY/CURLYN are used instead.)
5643
5644 A*B is compiled as <CURLYX><A><WHILEM><B>
5645
5646 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5647 state, which contains the current count, initialised to -1. It also sets
5648 cur_curlyx to point to this state, with any previous value saved in the
5649 state block.
5650
5651 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5652 since the pattern may possibly match zero times (i.e. it's a while {} loop
5653 rather than a do {} while loop).
5654
5655 Each entry to WHILEM represents a successful match of A. The count in the
5656 CURLYX block is incremented, another WHILEM state is pushed, and execution
5657 passes to A or B depending on greediness and the current count.
5658
5659 For example, if matching against the string a1a2a3b (where the aN are
5660 substrings that match /A/), then the match progresses as follows: (the
5661 pushed states are interspersed with the bits of strings matched so far):
5662
5663  <CURLYX cnt=-1>
5664  <CURLYX cnt=0><WHILEM>
5665  <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5666  <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5667  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5668  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5669
5670 (Contrast this with something like CURLYM, which maintains only a single
5671 backtrack state:
5672
5673  <CURLYM cnt=0> a1
5674  a1 <CURLYM cnt=1> a2
5675  a1 a2 <CURLYM cnt=2> a3
5676  a1 a2 a3 <CURLYM cnt=3> b
5677 )
5678
5679 Each WHILEM state block marks a point to backtrack to upon partial failure
5680 of A or B, and also contains some minor state data related to that
5681 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
5682 overall state, such as the count, and pointers to the A and B ops.
5683
5684 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5685 must always point to the *current* CURLYX block, the rules are:
5686
5687 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5688 and set cur_curlyx to point the new block.
5689
5690 When popping the CURLYX block after a successful or unsuccessful match,
5691 restore the previous cur_curlyx.
5692
5693 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5694 to the outer one saved in the CURLYX block.
5695
5696 When popping the WHILEM block after a successful or unsuccessful B match,
5697 restore the previous cur_curlyx.
5698
5699 Here's an example for the pattern (AI* BI)*BO
5700 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5701
5702 cur_
5703 curlyx backtrack stack
5704 ------ ---------------
5705 NULL
5706 CO     <CO prev=NULL> <WO>
5707 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5708 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5709 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5710
5711 At this point the pattern succeeds, and we work back down the stack to
5712 clean up, restoring as we go:
5713
5714 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5715 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5716 CO     <CO prev=NULL> <WO>
5717 NULL
5718
5719 *******************************************************************/
5720
5721 #define ST st->u.curlyx
5722
5723   case CURLYX:    /* start of /A*B/  (for complex A) */
5724   {
5725    /* No need to save/restore up to this paren */
5726    I32 parenfloor = scan->flags;
5727
5728    assert(next); /* keep Coverity happy */
5729    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5730     next += ARG(next);
5731
5732    /* XXXX Probably it is better to teach regpush to support
5733    parenfloor > maxopenparen ... */
5734    if (parenfloor > (I32)rex->lastparen)
5735     parenfloor = rex->lastparen; /* Pessimization... */
5736
5737    ST.prev_curlyx= cur_curlyx;
5738    cur_curlyx = st;
5739    ST.cp = PL_savestack_ix;
5740
5741    /* these fields contain the state of the current curly.
5742    * they are accessed by subsequent WHILEMs */
5743    ST.parenfloor = parenfloor;
5744    ST.me = scan;
5745    ST.B = next;
5746    ST.minmod = minmod;
5747    minmod = 0;
5748    ST.count = -1; /* this will be updated by WHILEM */
5749    ST.lastloc = NULL;  /* this will be updated by WHILEM */
5750
5751    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5752    assert(0); /* NOTREACHED */
5753   }
5754
5755   case CURLYX_end: /* just finished matching all of A*B */
5756    cur_curlyx = ST.prev_curlyx;
5757    sayYES;
5758    assert(0); /* NOTREACHED */
5759
5760   case CURLYX_end_fail: /* just failed to match all of A*B */
5761    regcpblow(ST.cp);
5762    cur_curlyx = ST.prev_curlyx;
5763    sayNO;
5764    assert(0); /* NOTREACHED */
5765
5766
5767 #undef ST
5768 #define ST st->u.whilem
5769
5770   case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
5771   {
5772    /* see the discussion above about CURLYX/WHILEM */
5773    I32 n;
5774    int min = ARG1(cur_curlyx->u.curlyx.me);
5775    int max = ARG2(cur_curlyx->u.curlyx.me);
5776    regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5777
5778    assert(cur_curlyx); /* keep Coverity happy */
5779    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5780    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5781    ST.cache_offset = 0;
5782    ST.cache_mask = 0;
5783
5784
5785    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5786     "%*s  whilem: matched %ld out of %d..%d\n",
5787     REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5788    );
5789
5790    /* First just match a string of min A's. */
5791
5792    if (n < min) {
5793     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5794          maxopenparen);
5795     cur_curlyx->u.curlyx.lastloc = locinput;
5796     REGCP_SET(ST.lastcp);
5797
5798     PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5799     assert(0); /* NOTREACHED */
5800    }
5801
5802    /* If degenerate A matches "", assume A done. */
5803
5804    if (locinput == cur_curlyx->u.curlyx.lastloc) {
5805     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5806     "%*s  whilem: empty match detected, trying continuation...\n",
5807     REPORT_CODE_OFF+depth*2, "")
5808     );
5809     goto do_whilem_B_max;
5810    }
5811
5812    /* super-linear cache processing.
5813    *
5814    * The idea here is that for certain types of CURLYX/WHILEM -
5815    * principally those whose upper bound is infinity (and
5816    * excluding regexes that have things like \1 and other very
5817    * non-regular expresssiony things), then if a pattern like
5818    * /....A*.../ fails and we backtrack to the WHILEM, then we
5819    * make a note that this particular WHILEM op was at string
5820    * position 47 (say) when the rest of pattern failed. Then, if
5821    * we ever find ourselves back at that WHILEM, and at string
5822    * position 47 again, we can just fail immediately rather than
5823    * running the rest of the pattern again.
5824    *
5825    * This is very handy when patterns start to go
5826    * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5827    * with a combinatorial explosion of backtracking.
5828    *
5829    * The cache is implemented as a bit array, with one bit per
5830    * string byte position per WHILEM op (up to 16) - so its
5831    * between 0.25 and 2x the string size.
5832    *
5833    * To avoid allocating a poscache buffer every time, we do an
5834    * initially countdown; only after we have  executed a WHILEM
5835    * op (string-length x #WHILEMs) times do we allocate the
5836    * cache.
5837    *
5838    * The top 4 bits of scan->flags byte say how many different
5839    * relevant CURLLYX/WHILEM op pairs there are, while the
5840    * bottom 4-bits is the identifying index number of this
5841    * WHILEM.
5842    */
5843
5844    if (scan->flags) {
5845
5846     if (!reginfo->poscache_maxiter) {
5847      /* start the countdown: Postpone detection until we
5848      * know the match is not *that* much linear. */
5849      reginfo->poscache_maxiter
5850       =    (reginfo->strend - reginfo->strbeg + 1)
5851       * (scan->flags>>4);
5852      /* possible overflow for long strings and many CURLYX's */
5853      if (reginfo->poscache_maxiter < 0)
5854       reginfo->poscache_maxiter = I32_MAX;
5855      reginfo->poscache_iter = reginfo->poscache_maxiter;
5856     }
5857
5858     if (reginfo->poscache_iter-- == 0) {
5859      /* initialise cache */
5860      const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
5861      regmatch_info_aux *const aux = reginfo->info_aux;
5862      if (aux->poscache) {
5863       if ((SSize_t)reginfo->poscache_size < size) {
5864        Renew(aux->poscache, size, char);
5865        reginfo->poscache_size = size;
5866       }
5867       Zero(aux->poscache, size, char);
5868      }
5869      else {
5870       reginfo->poscache_size = size;
5871       Newxz(aux->poscache, size, char);
5872      }
5873      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5874  "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5875        PL_colors[4], PL_colors[5])
5876      );
5877     }
5878
5879     if (reginfo->poscache_iter < 0) {
5880      /* have we already failed at this position? */
5881      SSize_t offset, mask;
5882
5883      reginfo->poscache_iter = -1; /* stop eventual underflow */
5884      offset  = (scan->flags & 0xf) - 1
5885         +   (locinput - reginfo->strbeg)
5886         * (scan->flags>>4);
5887      mask    = 1 << (offset % 8);
5888      offset /= 8;
5889      if (reginfo->info_aux->poscache[offset] & mask) {
5890       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5891        "%*s  whilem: (cache) already tried at this position...\n",
5892        REPORT_CODE_OFF+depth*2, "")
5893       );
5894       sayNO; /* cache records failure */
5895      }
5896      ST.cache_offset = offset;
5897      ST.cache_mask   = mask;
5898     }
5899    }
5900
5901    /* Prefer B over A for minimal matching. */
5902
5903    if (cur_curlyx->u.curlyx.minmod) {
5904     ST.save_curlyx = cur_curlyx;
5905     cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5906     ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5907        maxopenparen);
5908     REGCP_SET(ST.lastcp);
5909     PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5910          locinput);
5911     assert(0); /* NOTREACHED */
5912    }
5913
5914    /* Prefer A over B for maximal matching. */
5915
5916    if (n < max) { /* More greed allowed? */
5917     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5918        maxopenparen);
5919     cur_curlyx->u.curlyx.lastloc = locinput;
5920     REGCP_SET(ST.lastcp);
5921     PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5922     assert(0); /* NOTREACHED */
5923    }
5924    goto do_whilem_B_max;
5925   }
5926   assert(0); /* NOTREACHED */
5927
5928   case WHILEM_B_min: /* just matched B in a minimal match */
5929   case WHILEM_B_max: /* just matched B in a maximal match */
5930    cur_curlyx = ST.save_curlyx;
5931    sayYES;
5932    assert(0); /* NOTREACHED */
5933
5934   case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5935    cur_curlyx = ST.save_curlyx;
5936    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5937    cur_curlyx->u.curlyx.count--;
5938    CACHEsayNO;
5939    assert(0); /* NOTREACHED */
5940
5941   case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5942    /* FALL THROUGH */
5943   case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5944    REGCP_UNWIND(ST.lastcp);
5945    regcppop(rex, &maxopenparen);
5946    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5947    cur_curlyx->u.curlyx.count--;
5948    CACHEsayNO;
5949    assert(0); /* NOTREACHED */
5950
5951   case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5952    REGCP_UNWIND(ST.lastcp);
5953    regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5954    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5955     "%*s  whilem: failed, trying continuation...\n",
5956     REPORT_CODE_OFF+depth*2, "")
5957    );
5958   do_whilem_B_max:
5959    if (cur_curlyx->u.curlyx.count >= REG_INFTY
5960     && ckWARN(WARN_REGEXP)
5961     && !reginfo->warned)
5962    {
5963     reginfo->warned = TRUE;
5964     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5965      "Complex regular subexpression recursion limit (%d) "
5966      "exceeded",
5967      REG_INFTY - 1);
5968    }
5969
5970    /* now try B */
5971    ST.save_curlyx = cur_curlyx;
5972    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5973    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5974         locinput);
5975    assert(0); /* NOTREACHED */
5976
5977   case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5978    cur_curlyx = ST.save_curlyx;
5979    REGCP_UNWIND(ST.lastcp);
5980    regcppop(rex, &maxopenparen);
5981
5982    if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5983     /* Maximum greed exceeded */
5984     if (cur_curlyx->u.curlyx.count >= REG_INFTY
5985      && ckWARN(WARN_REGEXP)
5986      && !reginfo->warned)
5987     {
5988      reginfo->warned = TRUE;
5989      Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5990       "Complex regular subexpression recursion "
5991       "limit (%d) exceeded",
5992       REG_INFTY - 1);
5993     }
5994     cur_curlyx->u.curlyx.count--;
5995     CACHEsayNO;
5996    }
5997
5998    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5999     "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6000    );
6001    /* Try grabbing another A and see if it helps. */
6002    cur_curlyx->u.curlyx.lastloc = locinput;
6003    ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6004        maxopenparen);
6005    REGCP_SET(ST.lastcp);
6006    PUSH_STATE_GOTO(WHILEM_A_min,
6007     /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6008     locinput);
6009    assert(0); /* NOTREACHED */
6010
6011 #undef  ST
6012 #define ST st->u.branch
6013
6014   case BRANCHJ:     /*  /(...|A|...)/ with long next pointer */
6015    next = scan + ARG(scan);
6016    if (next == scan)
6017     next = NULL;
6018    scan = NEXTOPER(scan);
6019    /* FALL THROUGH */
6020
6021   case BRANCH:     /*  /(...|A|...)/ */
6022    scan = NEXTOPER(scan); /* scan now points to inner node */
6023    ST.lastparen = rex->lastparen;
6024    ST.lastcloseparen = rex->lastcloseparen;
6025    ST.next_branch = next;
6026    REGCP_SET(ST.cp);
6027
6028    /* Now go into the branch */
6029    if (has_cutgroup) {
6030     PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
6031    } else {
6032     PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
6033    }
6034    assert(0); /* NOTREACHED */
6035
6036   case CUTGROUP:  /*  /(*THEN)/  */
6037    sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
6038     MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6039    PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
6040    assert(0); /* NOTREACHED */
6041
6042   case CUTGROUP_next_fail:
6043    do_cutgroup = 1;
6044    no_final = 1;
6045    if (st->u.mark.mark_name)
6046     sv_commit = st->u.mark.mark_name;
6047    sayNO;
6048    assert(0); /* NOTREACHED */
6049
6050   case BRANCH_next:
6051    sayYES;
6052    assert(0); /* NOTREACHED */
6053
6054   case BRANCH_next_fail: /* that branch failed; try the next, if any */
6055    if (do_cutgroup) {
6056     do_cutgroup = 0;
6057     no_final = 0;
6058    }
6059    REGCP_UNWIND(ST.cp);
6060    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6061    scan = ST.next_branch;
6062    /* no more branches? */
6063    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
6064     DEBUG_EXECUTE_r({
6065      PerlIO_printf( Perl_debug_log,
6066       "%*s  %sBRANCH failed...%s\n",
6067       REPORT_CODE_OFF+depth*2, "",
6068       PL_colors[4],
6069       PL_colors[5] );
6070     });
6071     sayNO_SILENT;
6072    }
6073    continue; /* execute next BRANCH[J] op */
6074    assert(0); /* NOTREACHED */
6075
6076   case MINMOD: /* next op will be non-greedy, e.g. A*?  */
6077    minmod = 1;
6078    break;
6079
6080 #undef  ST
6081 #define ST st->u.curlym
6082
6083   case CURLYM: /* /A{m,n}B/ where A is fixed-length */
6084
6085    /* This is an optimisation of CURLYX that enables us to push
6086    * only a single backtracking state, no matter how many matches
6087    * there are in {m,n}. It relies on the pattern being constant
6088    * length, with no parens to influence future backrefs
6089    */
6090
6091    ST.me = scan;
6092    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6093
6094    ST.lastparen      = rex->lastparen;
6095    ST.lastcloseparen = rex->lastcloseparen;
6096
6097    /* if paren positive, emulate an OPEN/CLOSE around A */
6098    if (ST.me->flags) {
6099     U32 paren = ST.me->flags;
6100     if (paren > maxopenparen)
6101      maxopenparen = paren;
6102     scan += NEXT_OFF(scan); /* Skip former OPEN. */
6103    }
6104    ST.A = scan;
6105    ST.B = next;
6106    ST.alen = 0;
6107    ST.count = 0;
6108    ST.minmod = minmod;
6109    minmod = 0;
6110    ST.c1 = CHRTEST_UNINIT;
6111    REGCP_SET(ST.cp);
6112
6113    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
6114     goto curlym_do_B;
6115
6116   curlym_do_A: /* execute the A in /A{m,n}B/  */
6117    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
6118    assert(0); /* NOTREACHED */
6119
6120   case CURLYM_A: /* we've just matched an A */
6121    ST.count++;
6122    /* after first match, determine A's length: u.curlym.alen */
6123    if (ST.count == 1) {
6124     if (reginfo->is_utf8_target) {
6125      char *s = st->locinput;
6126      while (s < locinput) {
6127       ST.alen++;
6128       s += UTF8SKIP(s);
6129      }
6130     }
6131     else {
6132      ST.alen = locinput - st->locinput;
6133     }
6134     if (ST.alen == 0)
6135      ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
6136    }
6137    DEBUG_EXECUTE_r(
6138     PerlIO_printf(Perl_debug_log,
6139       "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
6140       (int)(REPORT_CODE_OFF+(depth*2)), "",
6141       (IV) ST.count, (IV)ST.alen)
6142    );
6143
6144    if (cur_eval && cur_eval->u.eval.close_paren &&
6145     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6146     goto fake_end;
6147
6148    {
6149     I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
6150     if ( max == REG_INFTY || ST.count < max )
6151      goto curlym_do_A; /* try to match another A */
6152    }
6153    goto curlym_do_B; /* try to match B */
6154
6155   case CURLYM_A_fail: /* just failed to match an A */
6156    REGCP_UNWIND(ST.cp);
6157
6158    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
6159     || (cur_eval && cur_eval->u.eval.close_paren &&
6160      cur_eval->u.eval.close_paren == (U32)ST.me->flags))
6161     sayNO;
6162
6163   curlym_do_B: /* execute the B in /A{m,n}B/  */
6164    if (ST.c1 == CHRTEST_UNINIT) {
6165     /* calculate c1 and c2 for possible match of 1st char
6166     * following curly */
6167     ST.c1 = ST.c2 = CHRTEST_VOID;
6168     if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
6169      regnode *text_node = ST.B;
6170      if (! HAS_TEXT(text_node))
6171       FIND_NEXT_IMPT(text_node);
6172      /* this used to be
6173
6174       (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
6175
6176        But the former is redundant in light of the latter.
6177
6178        if this changes back then the macro for
6179        IS_TEXT and friends need to change.
6180      */
6181      if (PL_regkind[OP(text_node)] == EXACT) {
6182       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6183       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6184       reginfo))
6185       {
6186        sayNO;
6187       }
6188      }
6189     }
6190    }
6191
6192    DEBUG_EXECUTE_r(
6193     PerlIO_printf(Perl_debug_log,
6194      "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
6195      (int)(REPORT_CODE_OFF+(depth*2)),
6196      "", (IV)ST.count)
6197     );
6198    if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
6199     if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
6200      if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6201       && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6202      {
6203       /* simulate B failing */
6204       DEBUG_OPTIMISE_r(
6205        PerlIO_printf(Perl_debug_log,
6206         "%*s  CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
6207         (int)(REPORT_CODE_OFF+(depth*2)),"",
6208         valid_utf8_to_uvchr((U8 *) locinput, NULL),
6209         valid_utf8_to_uvchr(ST.c1_utf8, NULL),
6210         valid_utf8_to_uvchr(ST.c2_utf8, NULL))
6211       );
6212       state_num = CURLYM_B_fail;
6213       goto reenter_switch;
6214      }
6215     }
6216     else if (nextchr != ST.c1 && nextchr != ST.c2) {
6217      /* simulate B failing */
6218      DEBUG_OPTIMISE_r(
6219       PerlIO_printf(Perl_debug_log,
6220        "%*s  CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
6221        (int)(REPORT_CODE_OFF+(depth*2)),"",
6222        (int) nextchr, ST.c1, ST.c2)
6223      );
6224      state_num = CURLYM_B_fail;
6225      goto reenter_switch;
6226     }
6227    }
6228
6229    if (ST.me->flags) {
6230     /* emulate CLOSE: mark current A as captured */
6231     I32 paren = ST.me->flags;
6232     if (ST.count) {
6233      rex->offs[paren].start
6234       = HOPc(locinput, -ST.alen) - reginfo->strbeg;
6235      rex->offs[paren].end = locinput - reginfo->strbeg;
6236      if ((U32)paren > rex->lastparen)
6237       rex->lastparen = paren;
6238      rex->lastcloseparen = paren;
6239     }
6240     else
6241      rex->offs[paren].end = -1;
6242     if (cur_eval && cur_eval->u.eval.close_paren &&
6243      cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6244     {
6245      if (ST.count)
6246       goto fake_end;
6247      else
6248       sayNO;
6249     }
6250    }
6251
6252    PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
6253    assert(0); /* NOTREACHED */
6254
6255   case CURLYM_B_fail: /* just failed to match a B */
6256    REGCP_UNWIND(ST.cp);
6257    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6258    if (ST.minmod) {
6259     I32 max = ARG2(ST.me);
6260     if (max != REG_INFTY && ST.count == max)
6261      sayNO;
6262     goto curlym_do_A; /* try to match a further A */
6263    }
6264    /* backtrack one A */
6265    if (ST.count == ARG1(ST.me) /* min */)
6266     sayNO;
6267    ST.count--;
6268    SET_locinput(HOPc(locinput, -ST.alen));
6269    goto curlym_do_B; /* try to match B */
6270
6271 #undef ST
6272 #define ST st->u.curly
6273
6274 #define CURLY_SETPAREN(paren, success) \
6275  if (paren) { \
6276   if (success) { \
6277    rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6278    rex->offs[paren].end = locinput - reginfo->strbeg; \
6279    if (paren > rex->lastparen) \
6280     rex->lastparen = paren; \
6281    rex->lastcloseparen = paren; \
6282   } \
6283   else { \
6284    rex->offs[paren].end = -1; \
6285    rex->lastparen      = ST.lastparen; \
6286    rex->lastcloseparen = ST.lastcloseparen; \
6287   } \
6288  }
6289
6290   case STAR:  /*  /A*B/ where A is width 1 char */
6291    ST.paren = 0;
6292    ST.min = 0;
6293    ST.max = REG_INFTY;
6294    scan = NEXTOPER(scan);
6295    goto repeat;
6296
6297   case PLUS:  /*  /A+B/ where A is width 1 char */
6298    ST.paren = 0;
6299    ST.min = 1;
6300    ST.max = REG_INFTY;
6301    scan = NEXTOPER(scan);
6302    goto repeat;
6303
6304   case CURLYN:  /*  /(A){m,n}B/ where A is width 1 char */
6305    ST.paren = scan->flags; /* Which paren to set */
6306    ST.lastparen      = rex->lastparen;
6307    ST.lastcloseparen = rex->lastcloseparen;
6308    if (ST.paren > maxopenparen)
6309     maxopenparen = ST.paren;
6310    ST.min = ARG1(scan);  /* min to match */
6311    ST.max = ARG2(scan);  /* max to match */
6312    if (cur_eval && cur_eval->u.eval.close_paren &&
6313     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6314     ST.min=1;
6315     ST.max=1;
6316    }
6317    scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6318    goto repeat;
6319
6320   case CURLY:  /*  /A{m,n}B/ where A is width 1 char */
6321    ST.paren = 0;
6322    ST.min = ARG1(scan);  /* min to match */
6323    ST.max = ARG2(scan);  /* max to match */
6324    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6325   repeat:
6326    /*
6327    * Lookahead to avoid useless match attempts
6328    * when we know what character comes next.
6329    *
6330    * Used to only do .*x and .*?x, but now it allows
6331    * for )'s, ('s and (?{ ... })'s to be in the way
6332    * of the quantifier and the EXACT-like node.  -- japhy
6333    */
6334
6335    assert(ST.min <= ST.max);
6336    if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6337     ST.c1 = ST.c2 = CHRTEST_VOID;
6338    }
6339    else {
6340     regnode *text_node = next;
6341
6342     if (! HAS_TEXT(text_node))
6343      FIND_NEXT_IMPT(text_node);
6344
6345     if (! HAS_TEXT(text_node))
6346      ST.c1 = ST.c2 = CHRTEST_VOID;
6347     else {
6348      if ( PL_regkind[OP(text_node)] != EXACT ) {
6349       ST.c1 = ST.c2 = CHRTEST_VOID;
6350      }
6351      else {
6352
6353      /*  Currently we only get here when
6354
6355       PL_rekind[OP(text_node)] == EXACT
6356
6357       if this changes back then the macro for IS_TEXT and
6358       friends need to change. */
6359       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6360       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6361       reginfo))
6362       {
6363        sayNO;
6364       }
6365      }
6366     }
6367    }
6368
6369    ST.A = scan;
6370    ST.B = next;
6371    if (minmod) {
6372     char *li = locinput;
6373     minmod = 0;
6374     if (ST.min &&
6375       regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6376        < ST.min)
6377      sayNO;
6378     SET_locinput(li);
6379     ST.count = ST.min;
6380     REGCP_SET(ST.cp);
6381     if (ST.c1 == CHRTEST_VOID)
6382      goto curly_try_B_min;
6383
6384     ST.oldloc = locinput;
6385
6386     /* set ST.maxpos to the furthest point along the
6387     * string that could possibly match */
6388     if  (ST.max == REG_INFTY) {
6389      ST.maxpos = reginfo->strend - 1;
6390      if (utf8_target)
6391       while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6392        ST.maxpos--;
6393     }
6394     else if (utf8_target) {
6395      int m = ST.max - ST.min;
6396      for (ST.maxpos = locinput;
6397       m >0 && ST.maxpos < reginfo->strend; m--)
6398       ST.maxpos += UTF8SKIP(ST.maxpos);
6399     }
6400     else {
6401      ST.maxpos = locinput + ST.max - ST.min;
6402      if (ST.maxpos >= reginfo->strend)
6403       ST.maxpos = reginfo->strend - 1;
6404     }
6405     goto curly_try_B_min_known;
6406
6407    }
6408    else {
6409     /* avoid taking address of locinput, so it can remain
6410     * a register var */
6411     char *li = locinput;
6412     ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6413     if (ST.count < ST.min)
6414      sayNO;
6415     SET_locinput(li);
6416     if ((ST.count > ST.min)
6417      && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6418     {
6419      /* A{m,n} must come at the end of the string, there's
6420      * no point in backing off ... */
6421      ST.min = ST.count;
6422      /* ...except that $ and \Z can match before *and* after
6423      newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
6424      We may back off by one in this case. */
6425      if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6426       ST.min--;
6427     }
6428     REGCP_SET(ST.cp);
6429     goto curly_try_B_max;
6430    }
6431    assert(0); /* NOTREACHED */
6432
6433
6434   case CURLY_B_min_known_fail:
6435    /* failed to find B in a non-greedy match where c1,c2 valid */
6436
6437    REGCP_UNWIND(ST.cp);
6438    if (ST.paren) {
6439     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6440    }
6441    /* Couldn't or didn't -- move forward. */
6442    ST.oldloc = locinput;
6443    if (utf8_target)
6444     locinput += UTF8SKIP(locinput);
6445    else
6446     locinput++;
6447    ST.count++;
6448   curly_try_B_min_known:
6449    /* find the next place where 'B' could work, then call B */
6450    {
6451     int n;
6452     if (utf8_target) {
6453      n = (ST.oldloc == locinput) ? 0 : 1;
6454      if (ST.c1 == ST.c2) {
6455       /* set n to utf8_distance(oldloc, locinput) */
6456       while (locinput <= ST.maxpos
6457        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6458       {
6459        locinput += UTF8SKIP(locinput);
6460        n++;
6461       }
6462      }
6463      else {
6464       /* set n to utf8_distance(oldloc, locinput) */
6465       while (locinput <= ST.maxpos
6466        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6467        && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6468       {
6469        locinput += UTF8SKIP(locinput);
6470        n++;
6471       }
6472      }
6473     }
6474     else {  /* Not utf8_target */
6475      if (ST.c1 == ST.c2) {
6476       while (locinput <= ST.maxpos &&
6477        UCHARAT(locinput) != ST.c1)
6478        locinput++;
6479      }
6480      else {
6481       while (locinput <= ST.maxpos
6482        && UCHARAT(locinput) != ST.c1
6483        && UCHARAT(locinput) != ST.c2)
6484        locinput++;
6485      }
6486      n = locinput - ST.oldloc;
6487     }
6488     if (locinput > ST.maxpos)
6489      sayNO;
6490     if (n) {
6491      /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6492      * at b; check that everything between oldloc and
6493      * locinput matches */
6494      char *li = ST.oldloc;
6495      ST.count += n;
6496      if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6497       sayNO;
6498      assert(n == REG_INFTY || locinput == li);
6499     }
6500     CURLY_SETPAREN(ST.paren, ST.count);
6501     if (cur_eval && cur_eval->u.eval.close_paren &&
6502      cur_eval->u.eval.close_paren == (U32)ST.paren) {
6503      goto fake_end;
6504     }
6505     PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6506    }
6507    assert(0); /* NOTREACHED */
6508
6509
6510   case CURLY_B_min_fail:
6511    /* failed to find B in a non-greedy match where c1,c2 invalid */
6512
6513    REGCP_UNWIND(ST.cp);
6514    if (ST.paren) {
6515     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6516    }
6517    /* failed -- move forward one */
6518    {
6519     char *li = locinput;
6520     if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6521      sayNO;
6522     }
6523     locinput = li;
6524    }
6525    {
6526     ST.count++;
6527     if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6528       ST.count > 0)) /* count overflow ? */
6529     {
6530     curly_try_B_min:
6531      CURLY_SETPAREN(ST.paren, ST.count);
6532      if (cur_eval && cur_eval->u.eval.close_paren &&
6533       cur_eval->u.eval.close_paren == (U32)ST.paren) {
6534       goto fake_end;
6535      }
6536      PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6537     }
6538    }
6539    sayNO;
6540    assert(0); /* NOTREACHED */
6541
6542
6543   curly_try_B_max:
6544    /* a successful greedy match: now try to match B */
6545    if (cur_eval && cur_eval->u.eval.close_paren &&
6546     cur_eval->u.eval.close_paren == (U32)ST.paren) {
6547     goto fake_end;
6548    }
6549    {
6550     bool could_match = locinput < reginfo->strend;
6551
6552     /* If it could work, try it. */
6553     if (ST.c1 != CHRTEST_VOID && could_match) {
6554      if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6555      {
6556       could_match = memEQ(locinput,
6557            ST.c1_utf8,
6558            UTF8SKIP(locinput))
6559          || memEQ(locinput,
6560            ST.c2_utf8,
6561            UTF8SKIP(locinput));
6562      }
6563      else {
6564       could_match = UCHARAT(locinput) == ST.c1
6565          || UCHARAT(locinput) == ST.c2;
6566      }
6567     }
6568     if (ST.c1 == CHRTEST_VOID || could_match) {
6569      CURLY_SETPAREN(ST.paren, ST.count);
6570      PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6571      assert(0); /* NOTREACHED */
6572     }
6573    }
6574    /* FALL THROUGH */
6575
6576   case CURLY_B_max_fail:
6577    /* failed to find B in a greedy match */
6578
6579    REGCP_UNWIND(ST.cp);
6580    if (ST.paren) {
6581     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6582    }
6583    /*  back up. */
6584    if (--ST.count < ST.min)
6585     sayNO;
6586    locinput = HOPc(locinput, -1);
6587    goto curly_try_B_max;
6588
6589 #undef ST
6590
6591   case END: /*  last op of main pattern  */
6592    fake_end:
6593    if (cur_eval) {
6594     /* we've just finished A in /(??{A})B/; now continue with B */
6595
6596     st->u.eval.prev_rex = rex_sv;  /* inner */
6597
6598     /* Save *all* the positions. */
6599     st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6600     rex_sv = cur_eval->u.eval.prev_rex;
6601     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6602     SET_reg_curpm(rex_sv);
6603     rex = ReANY(rex_sv);
6604     rexi = RXi_GET(rex);
6605     cur_curlyx = cur_eval->u.eval.prev_curlyx;
6606
6607     REGCP_SET(st->u.eval.lastcp);
6608
6609     /* Restore parens of the outer rex without popping the
6610     * savestack */
6611     S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6612           &maxopenparen);
6613
6614     st->u.eval.prev_eval = cur_eval;
6615     cur_eval = cur_eval->u.eval.prev_eval;
6616     DEBUG_EXECUTE_r(
6617      PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
6618          REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6619     if ( nochange_depth )
6620      nochange_depth--;
6621
6622     PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6623          locinput); /* match B */
6624    }
6625
6626    if (locinput < reginfo->till) {
6627     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6628          "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6629          PL_colors[4],
6630          (long)(locinput - startpos),
6631          (long)(reginfo->till - startpos),
6632          PL_colors[5]));
6633
6634     sayNO_SILENT;  /* Cannot match: too short. */
6635    }
6636    sayYES;   /* Success! */
6637
6638   case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6639    DEBUG_EXECUTE_r(
6640    PerlIO_printf(Perl_debug_log,
6641     "%*s  %ssubpattern success...%s\n",
6642     REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6643    sayYES;   /* Success! */
6644
6645 #undef  ST
6646 #define ST st->u.ifmatch
6647
6648   {
6649    char *newstart;
6650
6651   case SUSPEND: /* (?>A) */
6652    ST.wanted = 1;
6653    newstart = locinput;
6654    goto do_ifmatch;
6655
6656   case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6657    ST.wanted = 0;
6658    goto ifmatch_trivial_fail_test;
6659
6660   case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6661    ST.wanted = 1;
6662   ifmatch_trivial_fail_test:
6663    if (scan->flags) {
6664     char * const s = HOPBACKc(locinput, scan->flags);
6665     if (!s) {
6666      /* trivial fail */
6667      if (logical) {
6668       logical = 0;
6669       sw = 1 - cBOOL(ST.wanted);
6670      }
6671      else if (ST.wanted)
6672       sayNO;
6673      next = scan + ARG(scan);
6674      if (next == scan)
6675       next = NULL;
6676      break;
6677     }
6678     newstart = s;
6679    }
6680    else
6681     newstart = locinput;
6682
6683   do_ifmatch:
6684    ST.me = scan;
6685    ST.logical = logical;
6686    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6687
6688    /* execute body of (?...A) */
6689    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6690    assert(0); /* NOTREACHED */
6691   }
6692
6693   case IFMATCH_A_fail: /* body of (?...A) failed */
6694    ST.wanted = !ST.wanted;
6695    /* FALL THROUGH */
6696
6697   case IFMATCH_A: /* body of (?...A) succeeded */
6698    if (ST.logical) {
6699     sw = cBOOL(ST.wanted);
6700    }
6701    else if (!ST.wanted)
6702     sayNO;
6703
6704    if (OP(ST.me) != SUSPEND) {
6705     /* restore old position except for (?>...) */
6706     locinput = st->locinput;
6707    }
6708    scan = ST.me + ARG(ST.me);
6709    if (scan == ST.me)
6710     scan = NULL;
6711    continue; /* execute B */
6712
6713 #undef ST
6714
6715   case LONGJMP: /*  alternative with many branches compiles to
6716      * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6717    next = scan + ARG(scan);
6718    if (next == scan)
6719     next = NULL;
6720    break;
6721
6722   case COMMIT:  /*  (*COMMIT)  */
6723    reginfo->cutpoint = reginfo->strend;
6724    /* FALLTHROUGH */
6725
6726   case PRUNE:   /*  (*PRUNE)   */
6727    if (!scan->flags)
6728     sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6729    PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6730    assert(0); /* NOTREACHED */
6731
6732   case COMMIT_next_fail:
6733    no_final = 1;
6734    /* FALLTHROUGH */
6735
6736   case OPFAIL:   /* (*FAIL)  */
6737    sayNO;
6738    assert(0); /* NOTREACHED */
6739
6740 #define ST st->u.mark
6741   case MARKPOINT: /*  (*MARK:foo)  */
6742    ST.prev_mark = mark_state;
6743    ST.mark_name = sv_commit = sv_yes_mark
6744     = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6745    mark_state = st;
6746    ST.mark_loc = locinput;
6747    PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6748    assert(0); /* NOTREACHED */
6749
6750   case MARKPOINT_next:
6751    mark_state = ST.prev_mark;
6752    sayYES;
6753    assert(0); /* NOTREACHED */
6754
6755   case MARKPOINT_next_fail:
6756    if (popmark && sv_eq(ST.mark_name,popmark))
6757    {
6758     if (ST.mark_loc > startpoint)
6759      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6760     popmark = NULL; /* we found our mark */
6761     sv_commit = ST.mark_name;
6762
6763     DEBUG_EXECUTE_r({
6764       PerlIO_printf(Perl_debug_log,
6765        "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
6766        REPORT_CODE_OFF+depth*2, "",
6767        PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6768     });
6769    }
6770    mark_state = ST.prev_mark;
6771    sv_yes_mark = mark_state ?
6772     mark_state->u.mark.mark_name : NULL;
6773    sayNO;
6774    assert(0); /* NOTREACHED */
6775
6776   case SKIP:  /*  (*SKIP)  */
6777    if (scan->flags) {
6778     /* (*SKIP) : if we fail we cut here*/
6779     ST.mark_name = NULL;
6780     ST.mark_loc = locinput;
6781     PUSH_STATE_GOTO(SKIP_next,next, locinput);
6782    } else {
6783     /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6784     otherwise do nothing.  Meaning we need to scan
6785     */
6786     regmatch_state *cur = mark_state;
6787     SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6788
6789     while (cur) {
6790      if ( sv_eq( cur->u.mark.mark_name,
6791         find ) )
6792      {
6793       ST.mark_name = find;
6794       PUSH_STATE_GOTO( SKIP_next, next, locinput);
6795      }
6796      cur = cur->u.mark.prev_mark;
6797     }
6798    }
6799    /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6800    break;
6801
6802   case SKIP_next_fail:
6803    if (ST.mark_name) {
6804     /* (*CUT:NAME) - Set up to search for the name as we
6805     collapse the stack*/
6806     popmark = ST.mark_name;
6807    } else {
6808     /* (*CUT) - No name, we cut here.*/
6809     if (ST.mark_loc > startpoint)
6810      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6811     /* but we set sv_commit to latest mark_name if there
6812     is one so they can test to see how things lead to this
6813     cut */
6814     if (mark_state)
6815      sv_commit=mark_state->u.mark.mark_name;
6816    }
6817    no_final = 1;
6818    sayNO;
6819    assert(0); /* NOTREACHED */
6820 #undef ST
6821
6822   case LNBREAK: /* \R */
6823    if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6824     locinput += n;
6825    } else
6826     sayNO;
6827    break;
6828
6829   default:
6830    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6831       PTR2UV(scan), OP(scan));
6832    Perl_croak(aTHX_ "regexp memory corruption");
6833
6834   /* this is a point to jump to in order to increment
6835   * locinput by one character */
6836   increment_locinput:
6837    assert(!NEXTCHR_IS_EOS);
6838    if (utf8_target) {
6839     locinput += PL_utf8skip[nextchr];
6840     /* locinput is allowed to go 1 char off the end, but not 2+ */
6841     if (locinput > reginfo->strend)
6842      sayNO;
6843    }
6844    else
6845     locinput++;
6846    break;
6847
6848   } /* end switch */
6849
6850   /* switch break jumps here */
6851   scan = next; /* prepare to execute the next op and ... */
6852   continue;    /* ... jump back to the top, reusing st */
6853   assert(0); /* NOTREACHED */
6854
6855  push_yes_state:
6856   /* push a state that backtracks on success */
6857   st->u.yes.prev_yes_state = yes_state;
6858   yes_state = st;
6859   /* FALL THROUGH */
6860  push_state:
6861   /* push a new regex state, then continue at scan  */
6862   {
6863    regmatch_state *newst;
6864
6865    DEBUG_STACK_r({
6866     regmatch_state *cur = st;
6867     regmatch_state *curyes = yes_state;
6868     int curd = depth;
6869     regmatch_slab *slab = PL_regmatch_slab;
6870     for (;curd > -1;cur--,curd--) {
6871      if (cur < SLAB_FIRST(slab)) {
6872       slab = slab->prev;
6873       cur = SLAB_LAST(slab);
6874      }
6875      PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6876       REPORT_CODE_OFF + 2 + depth * 2,"",
6877       curd, PL_reg_name[cur->resume_state],
6878       (curyes == cur) ? "yes" : ""
6879      );
6880      if (curyes == cur)
6881       curyes = cur->u.yes.prev_yes_state;
6882     }
6883    } else
6884     DEBUG_STATE_pp("push")
6885    );
6886    depth++;
6887    st->locinput = locinput;
6888    newst = st+1;
6889    if (newst >  SLAB_LAST(PL_regmatch_slab))
6890     newst = S_push_slab(aTHX);
6891    PL_regmatch_state = newst;
6892
6893    locinput = pushinput;
6894    st = newst;
6895    continue;
6896    assert(0); /* NOTREACHED */
6897   }
6898  }
6899
6900  /*
6901  * We get here only if there's trouble -- normally "case END" is
6902  * the terminating point.
6903  */
6904  Perl_croak(aTHX_ "corrupted regexp pointers");
6905  /*NOTREACHED*/
6906  sayNO;
6907
6908 yes:
6909  if (yes_state) {
6910   /* we have successfully completed a subexpression, but we must now
6911   * pop to the state marked by yes_state and continue from there */
6912   assert(st != yes_state);
6913 #ifdef DEBUGGING
6914   while (st != yes_state) {
6915    st--;
6916    if (st < SLAB_FIRST(PL_regmatch_slab)) {
6917     PL_regmatch_slab = PL_regmatch_slab->prev;
6918     st = SLAB_LAST(PL_regmatch_slab);
6919    }
6920    DEBUG_STATE_r({
6921     if (no_final) {
6922      DEBUG_STATE_pp("pop (no final)");
6923     } else {
6924      DEBUG_STATE_pp("pop (yes)");
6925     }
6926    });
6927    depth--;
6928   }
6929 #else
6930   while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6931    || yes_state > SLAB_LAST(PL_regmatch_slab))
6932   {
6933    /* not in this slab, pop slab */
6934    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6935    PL_regmatch_slab = PL_regmatch_slab->prev;
6936    st = SLAB_LAST(PL_regmatch_slab);
6937   }
6938   depth -= (st - yes_state);
6939 #endif
6940   st = yes_state;
6941   yes_state = st->u.yes.prev_yes_state;
6942   PL_regmatch_state = st;
6943
6944   if (no_final)
6945    locinput= st->locinput;
6946   state_num = st->resume_state + no_final;
6947   goto reenter_switch;
6948  }
6949
6950  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6951       PL_colors[4], PL_colors[5]));
6952
6953  if (reginfo->info_aux_eval) {
6954   /* each successfully executed (?{...}) block does the equivalent of
6955   *   local $^R = do {...}
6956   * When popping the save stack, all these locals would be undone;
6957   * bypass this by setting the outermost saved $^R to the latest
6958   * value */
6959   /* I dont know if this is needed or works properly now.
6960   * see code related to PL_replgv elsewhere in this file.
6961   * Yves
6962   */
6963   if (oreplsv != GvSV(PL_replgv))
6964    sv_setsv(oreplsv, GvSV(PL_replgv));
6965  }
6966  result = 1;
6967  goto final_exit;
6968
6969 no:
6970  DEBUG_EXECUTE_r(
6971   PerlIO_printf(Perl_debug_log,
6972    "%*s  %sfailed...%s\n",
6973    REPORT_CODE_OFF+depth*2, "",
6974    PL_colors[4], PL_colors[5])
6975   );
6976
6977 no_silent:
6978  if (no_final) {
6979   if (yes_state) {
6980    goto yes;
6981   } else {
6982    goto final_exit;
6983   }
6984  }
6985  if (depth) {
6986   /* there's a previous state to backtrack to */
6987   st--;
6988   if (st < SLAB_FIRST(PL_regmatch_slab)) {
6989    PL_regmatch_slab = PL_regmatch_slab->prev;
6990    st = SLAB_LAST(PL_regmatch_slab);
6991   }
6992   PL_regmatch_state = st;
6993   locinput= st->locinput;
6994
6995   DEBUG_STATE_pp("pop");
6996   depth--;
6997   if (yes_state == st)
6998    yes_state = st->u.yes.prev_yes_state;
6999
7000   state_num = st->resume_state + 1; /* failure = success + 1 */
7001   goto reenter_switch;
7002  }
7003  result = 0;
7004
7005   final_exit:
7006  if (rex->intflags & PREGf_VERBARG_SEEN) {
7007   SV *sv_err = get_sv("REGERROR", 1);
7008   SV *sv_mrk = get_sv("REGMARK", 1);
7009   if (result) {
7010    sv_commit = &PL_sv_no;
7011    if (!sv_yes_mark)
7012     sv_yes_mark = &PL_sv_yes;
7013   } else {
7014    if (!sv_commit)
7015     sv_commit = &PL_sv_yes;
7016    sv_yes_mark = &PL_sv_no;
7017   }
7018   sv_setsv(sv_err, sv_commit);
7019   sv_setsv(sv_mrk, sv_yes_mark);
7020  }
7021
7022
7023  if (last_pushed_cv) {
7024   dSP;
7025   POP_MULTICALL;
7026   PERL_UNUSED_VAR(SP);
7027  }
7028
7029  assert(!result ||  locinput - reginfo->strbeg >= 0);
7030  return result ?  locinput - reginfo->strbeg : -1;
7031 }
7032
7033 /*
7034  - regrepeat - repeatedly match something simple, report how many
7035  *
7036  * What 'simple' means is a node which can be the operand of a quantifier like
7037  * '+', or {1,3}
7038  *
7039  * startposp - pointer a pointer to the start position.  This is updated
7040  *             to point to the byte following the highest successful
7041  *             match.
7042  * p         - the regnode to be repeatedly matched against.
7043  * reginfo   - struct holding match state, such as strend
7044  * max       - maximum number of things to match.
7045  * depth     - (for debugging) backtracking depth.
7046  */
7047 STATIC I32
7048 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
7049    regmatch_info *const reginfo, I32 max, int depth)
7050 {
7051  dVAR;
7052  char *scan;     /* Pointer to current position in target string */
7053  I32 c;
7054  char *loceol = reginfo->strend;   /* local version */
7055  I32 hardcount = 0;  /* How many matches so far */
7056  bool utf8_target = reginfo->is_utf8_target;
7057  int to_complement = 0;  /* Invert the result? */
7058  UV utf8_flags;
7059  _char_class_number classnum;
7060 #ifndef DEBUGGING
7061  PERL_UNUSED_ARG(depth);
7062 #endif
7063
7064  PERL_ARGS_ASSERT_REGREPEAT;
7065
7066  scan = *startposp;
7067  if (max == REG_INFTY)
7068   max = I32_MAX;
7069  else if (! utf8_target && loceol - scan > max)
7070   loceol = scan + max;
7071
7072  /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
7073  * to the maximum of how far we should go in it (leaving it set to the real
7074  * end, if the maximum permissible would take us beyond that).  This allows
7075  * us to make the loop exit condition that we haven't gone past <loceol> to
7076  * also mean that we haven't exceeded the max permissible count, saving a
7077  * test each time through the loop.  But it assumes that the OP matches a
7078  * single byte, which is true for most of the OPs below when applied to a
7079  * non-UTF-8 target.  Those relatively few OPs that don't have this
7080  * characteristic will have to compensate.
7081  *
7082  * There is no adjustment for UTF-8 targets, as the number of bytes per
7083  * character varies.  OPs will have to test both that the count is less
7084  * than the max permissible (using <hardcount> to keep track), and that we
7085  * are still within the bounds of the string (using <loceol>.  A few OPs
7086  * match a single byte no matter what the encoding.  They can omit the max
7087  * test if, for the UTF-8 case, they do the adjustment that was skipped
7088  * above.
7089  *
7090  * Thus, the code above sets things up for the common case; and exceptional
7091  * cases need extra work; the common case is to make sure <scan> doesn't
7092  * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
7093  * count doesn't exceed the maximum permissible */
7094
7095  switch (OP(p)) {
7096  case REG_ANY:
7097   if (utf8_target) {
7098    while (scan < loceol && hardcount < max && *scan != '\n') {
7099     scan += UTF8SKIP(scan);
7100     hardcount++;
7101    }
7102   } else {
7103    while (scan < loceol && *scan != '\n')
7104     scan++;
7105   }
7106   break;
7107  case SANY:
7108   if (utf8_target) {
7109    while (scan < loceol && hardcount < max) {
7110     scan += UTF8SKIP(scan);
7111     hardcount++;
7112    }
7113   }
7114   else
7115    scan = loceol;
7116   break;
7117  case CANY:  /* Move <scan> forward <max> bytes, unless goes off end */
7118   if (utf8_target && loceol - scan > max) {
7119
7120    /* <loceol> hadn't been adjusted in the UTF-8 case */
7121    scan +=  max;
7122   }
7123   else {
7124    scan = loceol;
7125   }
7126   break;
7127  case EXACT:
7128   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7129
7130   c = (U8)*STRING(p);
7131
7132   /* Can use a simple loop if the pattern char to match on is invariant
7133   * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
7134   * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
7135   * true iff it doesn't matter if the argument is in UTF-8 or not */
7136   if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
7137    if (utf8_target && loceol - scan > max) {
7138     /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7139     * since here, to match at all, 1 char == 1 byte */
7140     loceol = scan + max;
7141    }
7142    while (scan < loceol && UCHARAT(scan) == c) {
7143     scan++;
7144    }
7145   }
7146   else if (reginfo->is_utf8_pat) {
7147    if (utf8_target) {
7148     STRLEN scan_char_len;
7149
7150     /* When both target and pattern are UTF-8, we have to do
7151     * string EQ */
7152     while (hardcount < max
7153      && scan < loceol
7154      && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
7155      && memEQ(scan, STRING(p), scan_char_len))
7156     {
7157      scan += scan_char_len;
7158      hardcount++;
7159     }
7160    }
7161    else if (! UTF8_IS_ABOVE_LATIN1(c)) {
7162
7163     /* Target isn't utf8; convert the character in the UTF-8
7164     * pattern to non-UTF8, and do a simple loop */
7165     c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
7166     while (scan < loceol && UCHARAT(scan) == c) {
7167      scan++;
7168     }
7169    } /* else pattern char is above Latin1, can't possibly match the
7170     non-UTF-8 target */
7171   }
7172   else {
7173
7174    /* Here, the string must be utf8; pattern isn't, and <c> is
7175    * different in utf8 than not, so can't compare them directly.
7176    * Outside the loop, find the two utf8 bytes that represent c, and
7177    * then look for those in sequence in the utf8 string */
7178    U8 high = UTF8_TWO_BYTE_HI(c);
7179    U8 low = UTF8_TWO_BYTE_LO(c);
7180
7181    while (hardcount < max
7182      && scan + 1 < loceol
7183      && UCHARAT(scan) == high
7184      && UCHARAT(scan + 1) == low)
7185    {
7186     scan += 2;
7187     hardcount++;
7188    }
7189   }
7190   break;
7191
7192  case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
7193   assert(! reginfo->is_utf8_pat);
7194   /* FALL THROUGH */
7195  case EXACTFA:
7196   utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7197   goto do_exactf;
7198
7199  case EXACTFL:
7200   utf8_flags = FOLDEQ_LOCALE;
7201   goto do_exactf;
7202
7203  case EXACTF:   /* This node only generated for non-utf8 patterns */
7204   assert(! reginfo->is_utf8_pat);
7205   utf8_flags = 0;
7206   goto do_exactf;
7207
7208  case EXACTFU_SS:
7209  case EXACTFU:
7210   utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
7211
7212  do_exactf: {
7213   int c1, c2;
7214   U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
7215
7216   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7217
7218   if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
7219           reginfo))
7220   {
7221    if (c1 == CHRTEST_VOID) {
7222     /* Use full Unicode fold matching */
7223     char *tmpeol = reginfo->strend;
7224     STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
7225     while (hardcount < max
7226       && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
7227            STRING(p), NULL, pat_len,
7228            reginfo->is_utf8_pat, utf8_flags))
7229     {
7230      scan = tmpeol;
7231      tmpeol = reginfo->strend;
7232      hardcount++;
7233     }
7234    }
7235    else if (utf8_target) {
7236     if (c1 == c2) {
7237      while (scan < loceol
7238       && hardcount < max
7239       && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
7240      {
7241       scan += UTF8SKIP(scan);
7242       hardcount++;
7243      }
7244     }
7245     else {
7246      while (scan < loceol
7247       && hardcount < max
7248       && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
7249        || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
7250      {
7251       scan += UTF8SKIP(scan);
7252       hardcount++;
7253      }
7254     }
7255    }
7256    else if (c1 == c2) {
7257     while (scan < loceol && UCHARAT(scan) == c1) {
7258      scan++;
7259     }
7260    }
7261    else {
7262     while (scan < loceol &&
7263      (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
7264     {
7265      scan++;
7266     }
7267    }
7268   }
7269   break;
7270  }
7271  case ANYOF:
7272   if (utf8_target) {
7273    while (hardcount < max
7274     && scan < loceol
7275     && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
7276    {
7277     scan += UTF8SKIP(scan);
7278     hardcount++;
7279    }
7280   } else {
7281    while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
7282     scan++;
7283   }
7284   break;
7285
7286  /* The argument (FLAGS) to all the POSIX node types is the class number */
7287
7288  case NPOSIXL:
7289   to_complement = 1;
7290   /* FALLTHROUGH */
7291
7292  case POSIXL:
7293   if (! utf8_target) {
7294    while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7295                 *scan)))
7296    {
7297     scan++;
7298    }
7299   } else {
7300    while (hardcount < max && scan < loceol
7301     && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7302                 (U8 *) scan)))
7303    {
7304     scan += UTF8SKIP(scan);
7305     hardcount++;
7306    }
7307   }
7308   break;
7309
7310  case POSIXD:
7311   if (utf8_target) {
7312    goto utf8_posix;
7313   }
7314   /* FALLTHROUGH */
7315
7316  case POSIXA:
7317   if (utf8_target && loceol - scan > max) {
7318
7319    /* We didn't adjust <loceol> at the beginning of this routine
7320    * because is UTF-8, but it is actually ok to do so, since here, to
7321    * match, 1 char == 1 byte. */
7322    loceol = scan + max;
7323   }
7324   while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
7325    scan++;
7326   }
7327   break;
7328
7329  case NPOSIXD:
7330   if (utf8_target) {
7331    to_complement = 1;
7332    goto utf8_posix;
7333   }
7334   /* FALL THROUGH */
7335
7336  case NPOSIXA:
7337   if (! utf8_target) {
7338    while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
7339     scan++;
7340    }
7341   }
7342   else {
7343
7344    /* The complement of something that matches only ASCII matches all
7345    * UTF-8 variant code points, plus everything in ASCII that isn't
7346    * in the class. */
7347    while (hardcount < max && scan < loceol
7348     && (! UTF8_IS_INVARIANT(*scan)
7349      || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7350    {
7351     scan += UTF8SKIP(scan);
7352     hardcount++;
7353    }
7354   }
7355   break;
7356
7357  case NPOSIXU:
7358   to_complement = 1;
7359   /* FALLTHROUGH */
7360
7361  case POSIXU:
7362   if (! utf8_target) {
7363    while (scan < loceol && to_complement
7364         ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7365    {
7366     scan++;
7367    }
7368   }
7369   else {
7370  utf8_posix:
7371    classnum = (_char_class_number) FLAGS(p);
7372    if (classnum < _FIRST_NON_SWASH_CC) {
7373
7374     /* Here, a swash is needed for above-Latin1 code points.
7375     * Process as many Latin1 code points using the built-in rules.
7376     * Go to another loop to finish processing upon encountering
7377     * the first Latin1 code point.  We could do that in this loop
7378     * as well, but the other way saves having to test if the swash
7379     * has been loaded every time through the loop: extra space to
7380     * save a test. */
7381     while (hardcount < max && scan < loceol) {
7382      if (UTF8_IS_INVARIANT(*scan)) {
7383       if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7384                 classnum))))
7385       {
7386        break;
7387       }
7388       scan++;
7389      }
7390      else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7391       if (! (to_complement
7392        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7393                  *(scan + 1)),
7394              classnum))))
7395       {
7396        break;
7397       }
7398       scan += 2;
7399      }
7400      else {
7401       goto found_above_latin1;
7402      }
7403
7404      hardcount++;
7405     }
7406    }
7407    else {
7408     /* For these character classes, the knowledge of how to handle
7409     * every code point is compiled in to Perl via a macro.  This
7410     * code is written for making the loops as tight as possible.
7411     * It could be refactored to save space instead */
7412     switch (classnum) {
7413      case _CC_ENUM_SPACE:    /* XXX would require separate code
7414            if we revert the change of \v
7415            matching this */
7416       /* FALL THROUGH */
7417      case _CC_ENUM_PSXSPC:
7418       while (hardcount < max
7419        && scan < loceol
7420        && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7421       {
7422        scan += UTF8SKIP(scan);
7423        hardcount++;
7424       }
7425       break;
7426      case _CC_ENUM_BLANK:
7427       while (hardcount < max
7428        && scan < loceol
7429        && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7430       {
7431        scan += UTF8SKIP(scan);
7432        hardcount++;
7433       }
7434       break;
7435      case _CC_ENUM_XDIGIT:
7436       while (hardcount < max
7437        && scan < loceol
7438        && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7439       {
7440        scan += UTF8SKIP(scan);
7441        hardcount++;
7442       }
7443       break;
7444      case _CC_ENUM_VERTSPACE:
7445       while (hardcount < max
7446        && scan < loceol
7447        && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7448       {
7449        scan += UTF8SKIP(scan);
7450        hardcount++;
7451       }
7452       break;
7453      case _CC_ENUM_CNTRL:
7454       while (hardcount < max
7455        && scan < loceol
7456        && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7457       {
7458        scan += UTF8SKIP(scan);
7459        hardcount++;
7460       }
7461       break;
7462      default:
7463       Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7464     }
7465    }
7466   }
7467   break;
7468
7469  found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
7470
7471   /* Load the swash if not already present */
7472   if (! PL_utf8_swash_ptrs[classnum]) {
7473    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7474    PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7475           "utf8",
7476           "",
7477           &PL_sv_undef, 1, 0,
7478           PL_XPosix_ptrs[classnum], &flags);
7479   }
7480
7481   while (hardcount < max && scan < loceol
7482    && to_complement ^ cBOOL(_generic_utf8(
7483          classnum,
7484          scan,
7485          swash_fetch(PL_utf8_swash_ptrs[classnum],
7486             (U8 *) scan,
7487             TRUE))))
7488   {
7489    scan += UTF8SKIP(scan);
7490    hardcount++;
7491   }
7492   break;
7493
7494  case LNBREAK:
7495   if (utf8_target) {
7496    while (hardcount < max && scan < loceol &&
7497      (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7498     scan += c;
7499     hardcount++;
7500    }
7501   } else {
7502    /* LNBREAK can match one or two latin chars, which is ok, but we
7503    * have to use hardcount in this situation, and throw away the
7504    * adjustment to <loceol> done before the switch statement */
7505    loceol = reginfo->strend;
7506    while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7507     scan+=c;
7508     hardcount++;
7509    }
7510   }
7511   break;
7512
7513  case BOUND:
7514  case BOUNDA:
7515  case BOUNDL:
7516  case BOUNDU:
7517  case EOS:
7518  case GPOS:
7519  case KEEPS:
7520  case NBOUND:
7521  case NBOUNDA:
7522  case NBOUNDL:
7523  case NBOUNDU:
7524  case OPFAIL:
7525  case SBOL:
7526  case SEOL:
7527   /* These are all 0 width, so match right here or not at all. */
7528   break;
7529
7530  default:
7531   Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7532   assert(0); /* NOTREACHED */
7533
7534  }
7535
7536  if (hardcount)
7537   c = hardcount;
7538  else
7539   c = scan - *startposp;
7540  *startposp = scan;
7541
7542  DEBUG_r({
7543   GET_RE_DEBUG_FLAGS_DECL;
7544   DEBUG_EXECUTE_r({
7545    SV * const prop = sv_newmortal();
7546    regprop(prog, prop, p, reginfo);
7547    PerlIO_printf(Perl_debug_log,
7548       "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
7549       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7550   });
7551  });
7552
7553  return(c);
7554 }
7555
7556
7557 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7558 /*
7559 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
7560 create a copy so that changes the caller makes won't change the shared one.
7561 If <altsvp> is non-null, will return NULL in it, for back-compat.
7562  */
7563 SV *
7564 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7565 {
7566  PERL_ARGS_ASSERT_REGCLASS_SWASH;
7567
7568  if (altsvp) {
7569   *altsvp = NULL;
7570  }
7571
7572  return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL));
7573 }
7574
7575 SV *
7576 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
7577           const regnode* node,
7578           bool doinit,
7579           SV** listsvp,
7580           SV** only_utf8_locale_ptr)
7581 {
7582  /* For internal core use only.
7583  * Returns the swash for the input 'node' in the regex 'prog'.
7584  * If <doinit> is 'true', will attempt to create the swash if not already
7585  *   done.
7586  * If <listsvp> is non-null, will return the printable contents of the
7587  *    swash.  This can be used to get debugging information even before the
7588  *    swash exists, by calling this function with 'doinit' set to false, in
7589  *    which case the components that will be used to eventually create the
7590  *    swash are returned  (in a printable form).
7591  * Tied intimately to how regcomp.c sets up the data structure */
7592
7593  dVAR;
7594  SV *sw  = NULL;
7595  SV *si  = NULL;         /* Input swash initialization string */
7596  SV*  invlist = NULL;
7597
7598  RXi_GET_DECL(prog,progi);
7599  const struct reg_data * const data = prog ? progi->data : NULL;
7600
7601  PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
7602
7603  assert(ANYOF_FLAGS(node)
7604       & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
7605
7606  if (data && data->count) {
7607   const U32 n = ARG(node);
7608
7609   if (data->what[n] == 's') {
7610    SV * const rv = MUTABLE_SV(data->data[n]);
7611    AV * const av = MUTABLE_AV(SvRV(rv));
7612    SV **const ary = AvARRAY(av);
7613    U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7614
7615    si = *ary; /* ary[0] = the string to initialize the swash with */
7616
7617    /* Elements 3 and 4 are either both present or both absent. [3] is
7618    * any inversion list generated at compile time; [4] indicates if
7619    * that inversion list has any user-defined properties in it. */
7620    if (av_tindex(av) >= 2) {
7621     if (only_utf8_locale_ptr
7622      && ary[2]
7623      && ary[2] != &PL_sv_undef)
7624     {
7625      *only_utf8_locale_ptr = ary[2];
7626     }
7627     else {
7628      *only_utf8_locale_ptr = NULL;
7629     }
7630
7631     if (av_tindex(av) >= 3) {
7632      invlist = ary[3];
7633      if (SvUV(ary[4])) {
7634       swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7635      }
7636     }
7637     else {
7638      invlist = NULL;
7639     }
7640    }
7641
7642    /* Element [1] is reserved for the set-up swash.  If already there,
7643    * return it; if not, create it and store it there */
7644    if (ary[1] && SvROK(ary[1])) {
7645     sw = ary[1];
7646    }
7647    else if (doinit && ((si && si != &PL_sv_undef)
7648         || (invlist && invlist != &PL_sv_undef))) {
7649
7650     sw = _core_swash_init("utf8", /* the utf8 package */
7651          "", /* nameless */
7652          si,
7653          1, /* binary */
7654          0, /* not from tr/// */
7655          invlist,
7656          &swash_init_flags);
7657     (void)av_store(av, 1, sw);
7658    }
7659   }
7660  }
7661
7662  /* If requested, return a printable version of what this swash matches */
7663  if (listsvp) {
7664   SV* matches_string = newSVpvn("", 0);
7665
7666   /* The swash should be used, if possible, to get the data, as it
7667   * contains the resolved data.  But this function can be called at
7668   * compile-time, before everything gets resolved, in which case we
7669   * return the currently best available information, which is the string
7670   * that will eventually be used to do that resolving, 'si' */
7671   if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7672    && (si && si != &PL_sv_undef))
7673   {
7674    sv_catsv(matches_string, si);
7675   }
7676
7677   /* Add the inversion list to whatever we have.  This may have come from
7678   * the swash, or from an input parameter */
7679   if (invlist) {
7680    sv_catsv(matches_string, _invlist_contents(invlist));
7681   }
7682   *listsvp = matches_string;
7683  }
7684
7685  return sw;
7686 }
7687 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
7688
7689 /*
7690  - reginclass - determine if a character falls into a character class
7691
7692   n is the ANYOF regnode
7693   p is the target string
7694   p_end points to one byte beyond the end of the target string
7695   utf8_target tells whether p is in UTF-8.
7696
7697   Returns true if matched; false otherwise.
7698
7699   Note that this can be a synthetic start class, a combination of various
7700   nodes, so things you think might be mutually exclusive, such as locale,
7701   aren't.  It can match both locale and non-locale
7702
7703  */
7704
7705 STATIC bool
7706 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
7707 {
7708  dVAR;
7709  const char flags = ANYOF_FLAGS(n);
7710  bool match = FALSE;
7711  UV c = *p;
7712
7713  PERL_ARGS_ASSERT_REGINCLASS;
7714
7715  /* If c is not already the code point, get it.  Note that
7716  * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7717  if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7718   STRLEN c_len = 0;
7719   c = utf8n_to_uvchr(p, p_end - p, &c_len,
7720     (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7721     | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7722     /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7723     * UTF8_ALLOW_FFFF */
7724   if (c_len == (STRLEN)-1)
7725    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7726  }
7727
7728  /* If this character is potentially in the bitmap, check it */
7729  if (c < 256) {
7730   if (ANYOF_BITMAP_TEST(n, c))
7731    match = TRUE;
7732   else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL
7733     && ! utf8_target
7734     && ! isASCII(c))
7735   {
7736    match = TRUE;
7737   }
7738   else if (flags & ANYOF_LOCALE_FLAGS) {
7739    if (flags & ANYOF_LOC_FOLD) {
7740     if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
7741      match = TRUE;
7742     }
7743    }
7744    if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) {
7745
7746     /* The data structure is arranged so bits 0, 2, 4, ... are set
7747     * if the class includes the Posix character class given by
7748     * bit/2; and 1, 3, 5, ... are set if the class includes the
7749     * complemented Posix class given by int(bit/2).  So we loop
7750     * through the bits, each time changing whether we complement
7751     * the result or not.  Suppose for the sake of illustration
7752     * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
7753     * is set, it means there is a match for this ANYOF node if the
7754     * character is in the class given by the expression (0 / 2 = 0
7755     * = \w).  If it is in that class, isFOO_lc() will return 1,
7756     * and since 'to_complement' is 0, the result will stay TRUE,
7757     * and we exit the loop.  Suppose instead that bit 0 is 0, but
7758     * bit 1 is 1.  That means there is a match if the character
7759     * matches \W.  We won't bother to call isFOO_lc() on bit 0,
7760     * but will on bit 1.  On the second iteration 'to_complement'
7761     * will be 1, so the exclusive or will reverse things, so we
7762     * are testing for \W.  On the third iteration, 'to_complement'
7763     * will be 0, and we would be testing for \s; the fourth
7764     * iteration would test for \S, etc.
7765     *
7766     * Note that this code assumes that all the classes are closed
7767     * under folding.  For example, if a character matches \w, then
7768     * its fold does too; and vice versa.  This should be true for
7769     * any well-behaved locale for all the currently defined Posix
7770     * classes, except for :lower: and :upper:, which are handled
7771     * by the pseudo-class :cased: which matches if either of the
7772     * other two does.  To get rid of this assumption, an outer
7773     * loop could be used below to iterate over both the source
7774     * character, and its fold (if different) */
7775
7776     int count = 0;
7777     int to_complement = 0;
7778
7779     while (count < ANYOF_MAX) {
7780      if (ANYOF_POSIXL_TEST(n, count)
7781       && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7782      {
7783       match = TRUE;
7784       break;
7785      }
7786      count++;
7787      to_complement ^= 1;
7788     }
7789    }
7790   }
7791  }
7792
7793
7794  /* If the bitmap didn't (or couldn't) match, and something outside the
7795  * bitmap could match, try that. */
7796  if (!match) {
7797   if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) {
7798    match = TRUE; /* Everything above 255 matches */
7799   }
7800   else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
7801     || (utf8_target && (flags & ANYOF_UTF8))
7802     || ((flags & ANYOF_LOC_FOLD)
7803      && IN_UTF8_CTYPE_LOCALE
7804      && ARG(n) != ANYOF_NONBITMAP_EMPTY))
7805   {
7806    SV* only_utf8_locale = NULL;
7807    SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
7808                &only_utf8_locale);
7809    if (sw) {
7810     U8 * utf8_p;
7811     if (utf8_target) {
7812      utf8_p = (U8 *) p;
7813     } else { /* Convert to utf8 */
7814      STRLEN len = 1;
7815      utf8_p = bytes_to_utf8(p, &len);
7816     }
7817
7818     if (swash_fetch(sw, utf8_p, TRUE)) {
7819      match = TRUE;
7820     }
7821
7822     /* If we allocated a string above, free it */
7823     if (! utf8_target) Safefree(utf8_p);
7824    }
7825    if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
7826     match = _invlist_contains_cp(only_utf8_locale, c);
7827    }
7828   }
7829
7830   if (UNICODE_IS_SUPER(c)
7831    && (flags & ANYOF_WARN_SUPER)
7832    && ckWARN_d(WARN_NON_UNICODE))
7833   {
7834    Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7835     "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
7836   }
7837  }
7838
7839 #if ANYOF_INVERT != 1
7840  /* Depending on compiler optimization cBOOL takes time, so if don't have to
7841  * use it, don't */
7842 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
7843 #endif
7844
7845  /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7846  return (flags & ANYOF_INVERT) ^ match;
7847 }
7848
7849 STATIC U8 *
7850 S_reghop3(U8 *s, SSize_t off, const U8* lim)
7851 {
7852  /* return the position 'off' UTF-8 characters away from 's', forward if
7853  * 'off' >= 0, backwards if negative.  But don't go outside of position
7854  * 'lim', which better be < s  if off < 0 */
7855
7856  dVAR;
7857
7858  PERL_ARGS_ASSERT_REGHOP3;
7859
7860  if (off >= 0) {
7861   while (off-- && s < lim) {
7862    /* XXX could check well-formedness here */
7863    s += UTF8SKIP(s);
7864   }
7865  }
7866  else {
7867   while (off++ && s > lim) {
7868    s--;
7869    if (UTF8_IS_CONTINUED(*s)) {
7870     while (s > lim && UTF8_IS_CONTINUATION(*s))
7871      s--;
7872    }
7873    /* XXX could check well-formedness here */
7874   }
7875  }
7876  return s;
7877 }
7878
7879 STATIC U8 *
7880 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
7881 {
7882  dVAR;
7883
7884  PERL_ARGS_ASSERT_REGHOP4;
7885
7886  if (off >= 0) {
7887   while (off-- && s < rlim) {
7888    /* XXX could check well-formedness here */
7889    s += UTF8SKIP(s);
7890   }
7891  }
7892  else {
7893   while (off++ && s > llim) {
7894    s--;
7895    if (UTF8_IS_CONTINUED(*s)) {
7896     while (s > llim && UTF8_IS_CONTINUATION(*s))
7897      s--;
7898    }
7899    /* XXX could check well-formedness here */
7900   }
7901  }
7902  return s;
7903 }
7904
7905 /* like reghop3, but returns NULL on overrun, rather than returning last
7906  * char pos */
7907
7908 STATIC U8 *
7909 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
7910 {
7911  dVAR;
7912
7913  PERL_ARGS_ASSERT_REGHOPMAYBE3;
7914
7915  if (off >= 0) {
7916   while (off-- && s < lim) {
7917    /* XXX could check well-formedness here */
7918    s += UTF8SKIP(s);
7919   }
7920   if (off >= 0)
7921    return NULL;
7922  }
7923  else {
7924   while (off++ && s > lim) {
7925    s--;
7926    if (UTF8_IS_CONTINUED(*s)) {
7927     while (s > lim && UTF8_IS_CONTINUATION(*s))
7928      s--;
7929    }
7930    /* XXX could check well-formedness here */
7931   }
7932   if (off <= 0)
7933    return NULL;
7934  }
7935  return s;
7936 }
7937
7938
7939 /* when executing a regex that may have (?{}), extra stuff needs setting
7940    up that will be visible to the called code, even before the current
7941    match has finished. In particular:
7942
7943    * $_ is localised to the SV currently being matched;
7944    * pos($_) is created if necessary, ready to be updated on each call-out
7945  to code;
7946    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7947  isn't set until the current pattern is successfully finished), so that
7948  $1 etc of the match-so-far can be seen;
7949    * save the old values of subbeg etc of the current regex, and  set then
7950  to the current string (again, this is normally only done at the end
7951  of execution)
7952 */
7953
7954 static void
7955 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7956 {
7957  MAGIC *mg;
7958  regexp *const rex = ReANY(reginfo->prog);
7959  regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7960
7961  eval_state->rex = rex;
7962
7963  if (reginfo->sv) {
7964   /* Make $_ available to executed code. */
7965   if (reginfo->sv != DEFSV) {
7966    SAVE_DEFSV;
7967    DEFSV_set(reginfo->sv);
7968   }
7969
7970   if (!(mg = mg_find_mglob(reginfo->sv))) {
7971    /* prepare for quick setting of pos */
7972    mg = sv_magicext_mglob(reginfo->sv);
7973    mg->mg_len = -1;
7974   }
7975   eval_state->pos_magic = mg;
7976   eval_state->pos       = mg->mg_len;
7977   eval_state->pos_flags = mg->mg_flags;
7978  }
7979  else
7980   eval_state->pos_magic = NULL;
7981
7982  if (!PL_reg_curpm) {
7983   /* PL_reg_curpm is a fake PMOP that we can attach the current
7984   * regex to and point PL_curpm at, so that $1 et al are visible
7985   * within a /(?{})/. It's just allocated once per interpreter the
7986   * first time its needed */
7987   Newxz(PL_reg_curpm, 1, PMOP);
7988 #ifdef USE_ITHREADS
7989   {
7990    SV* const repointer = &PL_sv_undef;
7991    /* this regexp is also owned by the new PL_reg_curpm, which
7992    will try to free it.  */
7993    av_push(PL_regex_padav, repointer);
7994    PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
7995    PL_regex_pad = AvARRAY(PL_regex_padav);
7996   }
7997 #endif
7998  }
7999  SET_reg_curpm(reginfo->prog);
8000  eval_state->curpm = PL_curpm;
8001  PL_curpm = PL_reg_curpm;
8002  if (RXp_MATCH_COPIED(rex)) {
8003   /*  Here is a serious problem: we cannot rewrite subbeg,
8004    since it may be needed if this match fails.  Thus
8005    $` inside (?{}) could fail... */
8006   eval_state->subbeg     = rex->subbeg;
8007   eval_state->sublen     = rex->sublen;
8008   eval_state->suboffset  = rex->suboffset;
8009   eval_state->subcoffset = rex->subcoffset;
8010 #ifdef PERL_ANY_COW
8011   eval_state->saved_copy = rex->saved_copy;
8012 #endif
8013   RXp_MATCH_COPIED_off(rex);
8014  }
8015  else
8016   eval_state->subbeg = NULL;
8017  rex->subbeg = (char *)reginfo->strbeg;
8018  rex->suboffset = 0;
8019  rex->subcoffset = 0;
8020  rex->sublen = reginfo->strend - reginfo->strbeg;
8021 }
8022
8023
8024 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
8025
8026 static void
8027 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
8028 {
8029  dVAR;
8030  regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8031  regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
8032  regmatch_slab *s;
8033
8034  Safefree(aux->poscache);
8035
8036  if (eval_state) {
8037
8038   /* undo the effects of S_setup_eval_state() */
8039
8040   if (eval_state->subbeg) {
8041    regexp * const rex = eval_state->rex;
8042    rex->subbeg     = eval_state->subbeg;
8043    rex->sublen     = eval_state->sublen;
8044    rex->suboffset  = eval_state->suboffset;
8045    rex->subcoffset = eval_state->subcoffset;
8046 #ifdef PERL_ANY_COW
8047    rex->saved_copy = eval_state->saved_copy;
8048 #endif
8049    RXp_MATCH_COPIED_on(rex);
8050   }
8051   if (eval_state->pos_magic)
8052   {
8053    eval_state->pos_magic->mg_len = eval_state->pos;
8054    eval_state->pos_magic->mg_flags =
8055     (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8056    | (eval_state->pos_flags & MGf_BYTES);
8057   }
8058
8059   PL_curpm = eval_state->curpm;
8060  }
8061
8062  PL_regmatch_state = aux->old_regmatch_state;
8063  PL_regmatch_slab  = aux->old_regmatch_slab;
8064
8065  /* free all slabs above current one - this must be the last action
8066  * of this function, as aux and eval_state are allocated within
8067  * slabs and may be freed here */
8068
8069  s = PL_regmatch_slab->next;
8070  if (s) {
8071   PL_regmatch_slab->next = NULL;
8072   while (s) {
8073    regmatch_slab * const osl = s;
8074    s = s->next;
8075    Safefree(osl);
8076   }
8077  }
8078 }
8079
8080
8081 STATIC void
8082 S_to_utf8_substr(pTHX_ regexp *prog)
8083 {
8084  /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8085  * on the converted value */
8086
8087  int i = 1;
8088
8089  PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8090
8091  do {
8092   if (prog->substrs->data[i].substr
8093    && !prog->substrs->data[i].utf8_substr) {
8094    SV* const sv = newSVsv(prog->substrs->data[i].substr);
8095    prog->substrs->data[i].utf8_substr = sv;
8096    sv_utf8_upgrade(sv);
8097    if (SvVALID(prog->substrs->data[i].substr)) {
8098     if (SvTAIL(prog->substrs->data[i].substr)) {
8099      /* Trim the trailing \n that fbm_compile added last
8100      time.  */
8101      SvCUR_set(sv, SvCUR(sv) - 1);
8102      /* Whilst this makes the SV technically "invalid" (as its
8103      buffer is no longer followed by "\0") when fbm_compile()
8104      adds the "\n" back, a "\0" is restored.  */
8105      fbm_compile(sv, FBMcf_TAIL);
8106     } else
8107      fbm_compile(sv, 0);
8108    }
8109    if (prog->substrs->data[i].substr == prog->check_substr)
8110     prog->check_utf8 = sv;
8111   }
8112  } while (i--);
8113 }
8114
8115 STATIC bool
8116 S_to_byte_substr(pTHX_ regexp *prog)
8117 {
8118  /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8119  * on the converted value; returns FALSE if can't be converted. */
8120
8121  dVAR;
8122  int i = 1;
8123
8124  PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8125
8126  do {
8127   if (prog->substrs->data[i].utf8_substr
8128    && !prog->substrs->data[i].substr) {
8129    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
8130    if (! sv_utf8_downgrade(sv, TRUE)) {
8131     return FALSE;
8132    }
8133    if (SvVALID(prog->substrs->data[i].utf8_substr)) {
8134     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
8135      /* Trim the trailing \n that fbm_compile added last
8136       time.  */
8137      SvCUR_set(sv, SvCUR(sv) - 1);
8138      fbm_compile(sv, FBMcf_TAIL);
8139     } else
8140      fbm_compile(sv, 0);
8141    }
8142    prog->substrs->data[i].substr = sv;
8143    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
8144     prog->check_substr = sv;
8145   }
8146  } while (i--);
8147
8148  return TRUE;
8149 }
8150
8151 /*
8152  * Local variables:
8153  * c-indentation-style: bsd
8154  * c-basic-offset: 4
8155  * indent-tabs-mode: nil
8156  * End:
8157  *
8158  * ex: set ts=8 sts=4 sw=4 et:
8159  */