src/5023000/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *  One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 #define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
  41  "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
  42
  43 /*
  44  * pregcomp and pregexec -- regsub and regerror are not used in perl
  45  *
  46  * Copyright (c) 1986 by University of Toronto.
  47  * Written by Henry Spencer.  Not derived from licensed software.
  48  *
  49  * Permission is granted to anyone to use this software for any
  50  * purpose on any computer system, and to redistribute it freely,
  51  * subject to the following restrictions:
  52  *
  53  * 1. The author is not responsible for the consequences of use of
  54  *  this software, no matter how awful, even if they arise
  55  *  from defects in it.
  56  *
  57  * 2. The origin of this software must not be misrepresented, either
  58  *  by explicit claim or by omission.
  59  *
  60  * 3. Altered versions must be plainly marked as such, and must not
  61  *  be misrepresented as being the original software.
  62  *
  63  ****    Alterations to Henry's code are...
  64  ****
  65  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  66  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  67  ****    by Larry Wall and others
  68  ****
  69  ****    You may distribute under the terms of either the GNU General Public
  70  ****    License or the Artistic License, as specified in the README file.
  71  *
  72  * Beware that some of this code is subtly aware of the way operator
  73  * precedence is structured in regular expressions.  Serious changes in
  74  * regular-expression syntax might require a total rethink.
  75  */
  76 #include "EXTERN.h"
  77 #define PERL_IN_REGEXEC_C
  78 #undef PERL_IN_XSUB_RE
  79 #define PERL_IN_XSUB_RE 1
  80 #include "perl.h"
  81 #include "re_defs.h"
  82 #undef PERL_IN_XSUB_RE
  83
  84 #ifdef PERL_IN_XSUB_RE
  85 #  include "re_comp.h"
  86 #else
  87 #  include "regcomp.h"
  88 #endif
  89
  90 #include "inline_invlist.c"
  91 #include "unicode_constants.h"
  92
  93 #ifdef DEBUGGING
  94 /* At least one required character in the target string is expressible only in
  95  * UTF-8. */
  96 static const char* const non_utf8_target_but_utf8_required
  97     = "Can't match, because target string needs to be in UTF-8\n";
  98 #endif
  99
 100 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
 101  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
 102  goto target; \
 103 } STMT_END
 104
 105 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
 106
 107 #ifndef STATIC
 108 #define STATIC static
 109 #endif
 110
 111 /* Valid only for non-utf8 strings: avoids the reginclass
 112  * call if there are no complications: i.e., if everything matchable is
 113  * straight forward in the bitmap */
 114 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0)   \
 115            : ANYOF_BITMAP_TEST(p,*(c)))
 116
 117 /*
 118  * Forwards.
 119  */
 120
 121 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
 122 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
 123
 124 #define HOPc(pos,off) \
 125   (char *)(reginfo->is_utf8_target \
 126    ? reghop3((U8*)pos, off, \
 127      (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
 128    : (U8*)(pos + off))
 129
 130 #define HOPBACKc(pos, off) \
 131   (char*)(reginfo->is_utf8_target \
 132    ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
 133    : (pos - off >= reginfo->strbeg) \
 134     ? (U8*)pos - off  \
 135     : NULL)
 136
 137 #define HOP3(pos,off,lim) (reginfo->is_utf8_target  ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 138 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 139
 140 /* lim must be +ve. Returns NULL on overshoot */
 141 #define HOPMAYBE3(pos,off,lim) \
 142   (reginfo->is_utf8_target                        \
 143    ? reghopmaybe3((U8*)pos, off, (U8*)(lim))   \
 144    : ((U8*)pos + off <= lim)                   \
 145     ? (U8*)pos + off                        \
 146     : NULL)
 147
 148 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
 149  * off must be >=0; args should be vars rather than expressions */
 150 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
 151  ? reghop3((U8*)(pos), off, (U8*)(lim)) \
 152  : (U8*)((pos + off) > lim ? lim : (pos + off)))
 153
 154 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
 155  ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
 156  : (U8*)(pos + off))
 157 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
 158
 159 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
 160 #define NEXTCHR_IS_EOS (nextchr < 0)
 161
 162 #define SET_nextchr \
 163  nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
 164
 165 #define SET_locinput(p) \
 166  locinput = (p);  \
 167  SET_nextchr
 168
 169
 170 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START {   \
 171   if (!swash_ptr) {                                                     \
 172    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;                       \
 173    swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
 174           1, 0, invlist, &flags);              \
 175    assert(swash_ptr);                                                \
 176   }                                                                     \
 177  } STMT_END
 178
 179 /* If in debug mode, we test that a known character properly matches */
 180 #ifdef DEBUGGING
 181 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 182           property_name,                      \
 183           invlist,                            \
 184           utf8_char_in_property)              \
 185   LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist);               \
 186   assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
 187 #else
 188 #   define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr,                          \
 189           property_name,                      \
 190           invlist,                            \
 191           utf8_char_in_property)              \
 192   LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
 193 #endif
 194
 195 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST(           \
 196           PL_utf8_swash_ptrs[_CC_WORDCHAR],     \
 197           "",                                   \
 198           PL_XPosix_ptrs[_CC_WORDCHAR],         \
 199           LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
 200
 201 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
 202 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 203
 204 /* for use after a quantifier and before an EXACT-like node -- japhy */
 205 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
 206  *
 207  * NOTE that *nothing* that affects backtracking should be in here, specifically
 208  * VERBS must NOT be included. JUMPABLE is used to determine  if we can ignore a
 209  * node that is in between two EXACT like nodes when ascertaining what the required
 210  * "follow" character is. This should probably be moved to regex compile time
 211  * although it may be done at run time beause of the REF possibility - more
 212  * investigation required. -- demerphq
 213 */
 214 #define JUMPABLE(rn) (                                                             \
 215  OP(rn) == OPEN ||                                                              \
 216  (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 217  OP(rn) == EVAL ||                                                              \
 218  OP(rn) == SUSPEND || OP(rn) == IFMATCH ||                                      \
 219  OP(rn) == PLUS || OP(rn) == MINMOD ||                                          \
 220  OP(rn) == KEEPS ||                                                             \
 221  (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0)                                  \
 222 )
 223 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 224
 225 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 226
 227 #if 0
 228 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 229    we don't need this definition.  XXX These are now out-of-sync*/
 230 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 231 #define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF  || OP(rn)==NREFF )
 232 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 233
 234 #else
 235 /* ... so we use this as its faster. */
 236 #define IS_TEXT(rn)   ( OP(rn)==EXACT || OP(rn)==EXACTL )
 237 #define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
 238 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 239 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 240
 241 #endif
 242
 243 /*
 244   Search for mandatory following text node; for lookahead, the text must
 245   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 246 */
 247 #define FIND_NEXT_IMPT(rn) STMT_START {                                   \
 248  while (JUMPABLE(rn)) { \
 249   const OPCODE type = OP(rn); \
 250   if (type == SUSPEND || PL_regkind[type] == CURLY) \
 251    rn = NEXTOPER(NEXTOPER(rn)); \
 252   else if (type == PLUS) \
 253    rn = NEXTOPER(rn); \
 254   else if (type == IFMATCH) \
 255    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 256   else rn += NEXT_OFF(rn); \
 257  } \
 258 } STMT_END
 259
 260 #define SLAB_FIRST(s) (&(s)->states[0])
 261 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
 262
 263 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
 264 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
 265 static regmatch_state * S_push_slab(pTHX);
 266
 267 #define REGCP_PAREN_ELEMS 3
 268 #define REGCP_OTHER_ELEMS 3
 269 #define REGCP_FRAME_ELEMS 1
 270 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
 271  * are needed for the regexp context stack bookkeeping. */
 272
 273 STATIC CHECKPOINT
 274 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
 275 {
 276  const int retval = PL_savestack_ix;
 277  const int paren_elems_to_push =
 278     (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
 279  const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
 280  const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
 281  I32 p;
 282  GET_RE_DEBUG_FLAGS_DECL;
 283
 284  PERL_ARGS_ASSERT_REGCPPUSH;
 285
 286  if (paren_elems_to_push < 0)
 287   Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
 288     (int)paren_elems_to_push, (int)maxopenparen,
 289     (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
 290
 291  if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
 292   Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
 293     " out of range (%lu-%ld)",
 294     total_elems,
 295     (unsigned long)maxopenparen,
 296     (long)parenfloor);
 297
 298  SSGROW(total_elems + REGCP_FRAME_ELEMS);
 299
 300  DEBUG_BUFFERS_r(
 301   if ((int)maxopenparen > (int)parenfloor)
 302    PerlIO_printf(Perl_debug_log,
 303     "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
 304     PTR2UV(rex),
 305     PTR2UV(rex->offs)
 306    );
 307  );
 308  for (p = parenfloor+1; p <= (I32)maxopenparen;  p++) {
 309 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 310   SSPUSHIV(rex->offs[p].end);
 311   SSPUSHIV(rex->offs[p].start);
 312   SSPUSHINT(rex->offs[p].start_tmp);
 313   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 314    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
 315    (UV)p,
 316    (IV)rex->offs[p].start,
 317    (IV)rex->offs[p].start_tmp,
 318    (IV)rex->offs[p].end
 319   ));
 320  }
 321 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 322  SSPUSHINT(maxopenparen);
 323  SSPUSHINT(rex->lastparen);
 324  SSPUSHINT(rex->lastcloseparen);
 325  SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
 326
 327  return retval;
 328 }
 329
 330 /* These are needed since we do not localize EVAL nodes: */
 331 #define REGCP_SET(cp)                                           \
 332  DEBUG_STATE_r(                                              \
 333    PerlIO_printf(Perl_debug_log,          \
 334     "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 335     (IV)PL_savestack_ix));                          \
 336  cp = PL_savestack_ix
 337
 338 #define REGCP_UNWIND(cp)                                        \
 339  DEBUG_STATE_r(                                              \
 340   if (cp != PL_savestack_ix)                   \
 341     PerlIO_printf(Perl_debug_log,          \
 342     "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 343     (IV)(cp), (IV)PL_savestack_ix));                \
 344  regcpblow(cp)
 345
 346 #define UNWIND_PAREN(lp, lcp)               \
 347  for (n = rex->lastparen; n > lp; n--)   \
 348   rex->offs[n].end = -1;              \
 349  rex->lastparen = n;                     \
 350  rex->lastcloseparen = lcp;
 351
 352
 353 STATIC void
 354 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
 355 {
 356  UV i;
 357  U32 paren;
 358  GET_RE_DEBUG_FLAGS_DECL;
 359
 360  PERL_ARGS_ASSERT_REGCPPOP;
 361
 362  /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 363  i = SSPOPUV;
 364  assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 365  i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
 366  rex->lastcloseparen = SSPOPINT;
 367  rex->lastparen = SSPOPINT;
 368  *maxopenparen_p = SSPOPINT;
 369
 370  i -= REGCP_OTHER_ELEMS;
 371  /* Now restore the parentheses context. */
 372  DEBUG_BUFFERS_r(
 373   if (i || rex->lastparen + 1 <= rex->nparens)
 374    PerlIO_printf(Perl_debug_log,
 375     "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
 376     PTR2UV(rex),
 377     PTR2UV(rex->offs)
 378    );
 379  );
 380  paren = *maxopenparen_p;
 381  for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
 382   SSize_t tmps;
 383   rex->offs[paren].start_tmp = SSPOPINT;
 384   rex->offs[paren].start = SSPOPIV;
 385   tmps = SSPOPIV;
 386   if (paren <= rex->lastparen)
 387    rex->offs[paren].end = tmps;
 388   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 389    "    \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 390    (UV)paren,
 391    (IV)rex->offs[paren].start,
 392    (IV)rex->offs[paren].start_tmp,
 393    (IV)rex->offs[paren].end,
 394    (paren > rex->lastparen ? "(skipped)" : ""));
 395   );
 396   paren--;
 397  }
 398 #if 1
 399  /* It would seem that the similar code in regtry()
 400  * already takes care of this, and in fact it is in
 401  * a better location to since this code can #if 0-ed out
 402  * but the code in regtry() is needed or otherwise tests
 403  * requiring null fields (pat.t#187 and split.t#{13,14}
 404  * (as of patchlevel 7877)  will fail.  Then again,
 405  * this code seems to be necessary or otherwise
 406  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 407  * --jhi updated by dapm */
 408  for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
 409   if (i > *maxopenparen_p)
 410    rex->offs[i].start = -1;
 411   rex->offs[i].end = -1;
 412   DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
 413    "    \\%"UVuf": %s   ..-1 undeffing\n",
 414    (UV)i,
 415    (i > *maxopenparen_p) ? "-1" : "  "
 416   ));
 417  }
 418 #endif
 419 }
 420
 421 /* restore the parens and associated vars at savestack position ix,
 422  * but without popping the stack */
 423
 424 STATIC void
 425 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
 426 {
 427  I32 tmpix = PL_savestack_ix;
 428  PL_savestack_ix = ix;
 429  regcppop(rex, maxopenparen_p);
 430  PL_savestack_ix = tmpix;
 431 }
 432
 433 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
 434
 435 STATIC bool
 436 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
 437 {
 438  /* Returns a boolean as to whether or not 'character' is a member of the
 439  * Posix character class given by 'classnum' that should be equivalent to a
 440  * value in the typedef '_char_class_number'.
 441  *
 442  * Ideally this could be replaced by a just an array of function pointers
 443  * to the C library functions that implement the macros this calls.
 444  * However, to compile, the precise function signatures are required, and
 445  * these may vary from platform to to platform.  To avoid having to figure
 446  * out what those all are on each platform, I (khw) am using this method,
 447  * which adds an extra layer of function call overhead (unless the C
 448  * optimizer strips it away).  But we don't particularly care about
 449  * performance with locales anyway. */
 450
 451  switch ((_char_class_number) classnum) {
 452   case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
 453   case _CC_ENUM_ALPHA:     return isALPHA_LC(character);
 454   case _CC_ENUM_ASCII:     return isASCII_LC(character);
 455   case _CC_ENUM_BLANK:     return isBLANK_LC(character);
 456   case _CC_ENUM_CASED:     return isLOWER_LC(character)
 457           || isUPPER_LC(character);
 458   case _CC_ENUM_CNTRL:     return isCNTRL_LC(character);
 459   case _CC_ENUM_DIGIT:     return isDIGIT_LC(character);
 460   case _CC_ENUM_GRAPH:     return isGRAPH_LC(character);
 461   case _CC_ENUM_LOWER:     return isLOWER_LC(character);
 462   case _CC_ENUM_PRINT:     return isPRINT_LC(character);
 463   case _CC_ENUM_PUNCT:     return isPUNCT_LC(character);
 464   case _CC_ENUM_SPACE:     return isSPACE_LC(character);
 465   case _CC_ENUM_UPPER:     return isUPPER_LC(character);
 466   case _CC_ENUM_WORDCHAR:  return isWORDCHAR_LC(character);
 467   case _CC_ENUM_XDIGIT:    return isXDIGIT_LC(character);
 468   default:    /* VERTSPACE should never occur in locales */
 469    Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
 470  }
 471
 472  NOT_REACHED; /* NOTREACHED */
 473  return FALSE;
 474 }
 475
 476 STATIC bool
 477 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
 478 {
 479  /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
 480  * 'character' is a member of the Posix character class given by 'classnum'
 481  * that should be equivalent to a value in the typedef
 482  * '_char_class_number'.
 483  *
 484  * This just calls isFOO_lc on the code point for the character if it is in
 485  * the range 0-255.  Outside that range, all characters use Unicode
 486  * rules, ignoring any locale.  So use the Unicode function if this class
 487  * requires a swash, and use the Unicode macro otherwise. */
 488
 489  PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 490
 491  if (UTF8_IS_INVARIANT(*character)) {
 492   return isFOO_lc(classnum, *character);
 493  }
 494  else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
 495   return isFOO_lc(classnum,
 496       TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
 497  }
 498
 499  _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
 500
 501  if (classnum < _FIRST_NON_SWASH_CC) {
 502
 503   /* Initialize the swash unless done already */
 504   if (! PL_utf8_swash_ptrs[classnum]) {
 505    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
 506    PL_utf8_swash_ptrs[classnum] =
 507      _core_swash_init("utf8",
 508          "",
 509          &PL_sv_undef, 1, 0,
 510          PL_XPosix_ptrs[classnum], &flags);
 511   }
 512
 513   return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
 514         character,
 515         TRUE /* is UTF */ ));
 516  }
 517
 518  switch ((_char_class_number) classnum) {
 519   case _CC_ENUM_SPACE:     return is_XPERLSPACE_high(character);
 520   case _CC_ENUM_BLANK:     return is_HORIZWS_high(character);
 521   case _CC_ENUM_XDIGIT:    return is_XDIGIT_high(character);
 522   case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
 523   default:                 break;
 524  }
 525
 526  return FALSE; /* Things like CNTRL are always below 256 */
 527 }
 528
 529 /*
 530  * pregexec and friends
 531  */
 532
 533 #ifndef PERL_IN_XSUB_RE
 534 /*
 535  - pregexec - match a regexp against a string
 536  */
 537 I32
 538 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
 539   char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
 540 /* stringarg: the point in the string at which to begin matching */
 541 /* strend:    pointer to null at end of string */
 542 /* strbeg:    real beginning of string */
 543 /* minend:    end of match must be >= minend bytes after stringarg. */
 544 /* screamer:  SV being matched: only used for utf8 flag, pos() etc; string
 545  *            itself is accessed via the pointers above */
 546 /* nosave:    For optimizations. */
 547 {
 548  PERL_ARGS_ASSERT_PREGEXEC;
 549
 550  return
 551   regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 552      nosave ? 0 : REXEC_COPY_STR);
 553 }
 554 #endif
 555
 556
 557
 558 /* re_intuit_start():
 559  *
 560  * Based on some optimiser hints, try to find the earliest position in the
 561  * string where the regex could match.
 562  *
 563  *   rx:     the regex to match against
 564  *   sv:     the SV being matched: only used for utf8 flag; the string
 565  *           itself is accessed via the pointers below. Note that on
 566  *           something like an overloaded SV, SvPOK(sv) may be false
 567  *           and the string pointers may point to something unrelated to
 568  *           the SV itself.
 569  *   strbeg: real beginning of string
 570  *   strpos: the point in the string at which to begin matching
 571  *   strend: pointer to the byte following the last char of the string
 572  *   flags   currently unused; set to 0
 573  *   data:   currently unused; set to NULL
 574  *
 575  * The basic idea of re_intuit_start() is to use some known information
 576  * about the pattern, namely:
 577  *
 578  *   a) the longest known anchored substring (i.e. one that's at a
 579  *      constant offset from the beginning of the pattern; but not
 580  *      necessarily at a fixed offset from the beginning of the
 581  *      string);
 582  *   b) the longest floating substring (i.e. one that's not at a constant
 583  *      offset from the beginning of the pattern);
 584  *   c) Whether the pattern is anchored to the string; either
 585  *      an absolute anchor: /^../, or anchored to \n: /^.../m,
 586  *      or anchored to pos(): /\G/;
 587  *   d) A start class: a real or synthetic character class which
 588  *      represents which characters are legal at the start of the pattern;
 589  *
 590  * to either quickly reject the match, or to find the earliest position
 591  * within the string at which the pattern might match, thus avoiding
 592  * running the full NFA engine at those earlier locations, only to
 593  * eventually fail and retry further along.
 594  *
 595  * Returns NULL if the pattern can't match, or returns the address within
 596  * the string which is the earliest place the match could occur.
 597  *
 598  * The longest of the anchored and floating substrings is called 'check'
 599  * and is checked first. The other is called 'other' and is checked
 600  * second. The 'other' substring may not be present.  For example,
 601  *
 602  *    /(abc|xyz)ABC\d{0,3}DEFG/
 603  *
 604  * will have
 605  *
 606  *   check substr (float)    = "DEFG", offset 6..9 chars
 607  *   other substr (anchored) = "ABC",  offset 3..3 chars
 608  *   stclass = [ax]
 609  *
 610  * Be aware that during the course of this function, sometimes 'anchored'
 611  * refers to a substring being anchored relative to the start of the
 612  * pattern, and sometimes to the pattern itself being anchored relative to
 613  * the string. For example:
 614  *
 615  *   /\dabc/:   "abc" is anchored to the pattern;
 616  *   /^\dabc/:  "abc" is anchored to the pattern and the string;
 617  *   /\d+abc/:  "abc" is anchored to neither the pattern nor the string;
 618  *   /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
 619  *                    but the pattern is anchored to the string.
 620  */
 621
 622 char *
 623 Perl_re_intuit_start(pTHX_
 624      REGEXP * const rx,
 625      SV *sv,
 626      const char * const strbeg,
 627      char *strpos,
 628      char *strend,
 629      const U32 flags,
 630      re_scream_pos_data *data)
 631 {
 632  struct regexp *const prog = ReANY(rx);
 633  SSize_t start_shift = prog->check_offset_min;
 634  /* Should be nonnegative! */
 635  SSize_t end_shift   = 0;
 636  /* current lowest pos in string where the regex can start matching */
 637  char *rx_origin = strpos;
 638  SV *check;
 639  const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 640  U8   other_ix = 1 - prog->substrs->check_ix;
 641  bool ml_anch = 0;
 642  char *other_last = strpos;/* latest pos 'other' substr already checked to */
 643  char *check_at = NULL;  /* check substr found at this pos */
 644  const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 645  RXi_GET_DECL(prog,progi);
 646  regmatch_info reginfo_buf;  /* create some info to pass to find_byclass */
 647  regmatch_info *const reginfo = &reginfo_buf;
 648  GET_RE_DEBUG_FLAGS_DECL;
 649
 650  PERL_ARGS_ASSERT_RE_INTUIT_START;
 651  PERL_UNUSED_ARG(flags);
 652  PERL_UNUSED_ARG(data);
 653
 654  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 655     "Intuit: trying to determine minimum start position...\n"));
 656
 657  /* for now, assume that all substr offsets are positive. If at some point
 658  * in the future someone wants to do clever things with look-behind and
 659  * -ve offsets, they'll need to fix up any code in this function
 660  * which uses these offsets. See the thread beginning
 661  * <20140113145929.GF27210@iabyn.com>
 662  */
 663  assert(prog->substrs->data[0].min_offset >= 0);
 664  assert(prog->substrs->data[0].max_offset >= 0);
 665  assert(prog->substrs->data[1].min_offset >= 0);
 666  assert(prog->substrs->data[1].max_offset >= 0);
 667  assert(prog->substrs->data[2].min_offset >= 0);
 668  assert(prog->substrs->data[2].max_offset >= 0);
 669
 670  /* for now, assume that if both present, that the floating substring
 671  * doesn't start before the anchored substring.
 672  * If you break this assumption (e.g. doing better optimisations
 673  * with lookahead/behind), then you'll need to audit the code in this
 674  * function carefully first
 675  */
 676  assert(
 677    ! (  (prog->anchored_utf8 || prog->anchored_substr)
 678    && (prog->float_utf8    || prog->float_substr))
 679   || (prog->float_min_offset >= prog->anchored_offset));
 680
 681  /* byte rather than char calculation for efficiency. It fails
 682  * to quickly reject some cases that can't match, but will reject
 683  * them later after doing full char arithmetic */
 684  if (prog->minlen > strend - strpos) {
 685   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 686        "  String too short...\n"));
 687   goto fail;
 688  }
 689
 690  RX_MATCH_UTF8_set(rx,utf8_target);
 691  reginfo->is_utf8_target = cBOOL(utf8_target);
 692  reginfo->info_aux = NULL;
 693  reginfo->strbeg = strbeg;
 694  reginfo->strend = strend;
 695  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
 696  reginfo->intuit = 1;
 697  /* not actually used within intuit, but zero for safety anyway */
 698  reginfo->poscache_maxiter = 0;
 699
 700  if (utf8_target) {
 701   if (!prog->check_utf8 && prog->check_substr)
 702    to_utf8_substr(prog);
 703   check = prog->check_utf8;
 704  } else {
 705   if (!prog->check_substr && prog->check_utf8) {
 706    if (! to_byte_substr(prog)) {
 707     NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
 708    }
 709   }
 710   check = prog->check_substr;
 711  }
 712
 713  /* dump the various substring data */
 714  DEBUG_OPTIMISE_MORE_r({
 715   int i;
 716   for (i=0; i<=2; i++) {
 717    SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
 718         : prog->substrs->data[i].substr);
 719    if (!sv)
 720     continue;
 721
 722    PerlIO_printf(Perl_debug_log,
 723     "  substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
 724     " useful=%"IVdf" utf8=%d [%s]\n",
 725     i,
 726     (IV)prog->substrs->data[i].min_offset,
 727     (IV)prog->substrs->data[i].max_offset,
 728     (IV)prog->substrs->data[i].end_shift,
 729     BmUSEFUL(sv),
 730     utf8_target ? 1 : 0,
 731     SvPEEK(sv));
 732   }
 733  });
 734
 735  if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
 736
 737   /* ml_anch: check after \n?
 738   *
 739   * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
 740   * with /.*.../, these flags will have been added by the
 741   * compiler:
 742   *   /.*abc/, /.*abc/m:  PREGf_IMPLICIT | PREGf_ANCH_MBOL
 743   *   /.*abc/s:           PREGf_IMPLICIT | PREGf_ANCH_SBOL
 744   */
 745   ml_anch =      (prog->intflags & PREGf_ANCH_MBOL)
 746     && !(prog->intflags & PREGf_IMPLICIT);
 747
 748   if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
 749    /* we are only allowed to match at BOS or \G */
 750
 751    /* trivially reject if there's a BOS anchor and we're not at BOS.
 752    *
 753    * Note that we don't try to do a similar quick reject for
 754    * \G, since generally the caller will have calculated strpos
 755    * based on pos() and gofs, so the string is already correctly
 756    * anchored by definition; and handling the exceptions would
 757    * be too fiddly (e.g. REXEC_IGNOREPOS).
 758    */
 759    if (   strpos != strbeg
 760     && (prog->intflags & PREGf_ANCH_SBOL))
 761    {
 762     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 763         "  Not at start...\n"));
 764     goto fail;
 765    }
 766
 767    /* in the presence of an anchor, the anchored (relative to the
 768    * start of the regex) substr must also be anchored relative
 769    * to strpos. So quickly reject if substr isn't found there.
 770    * This works for \G too, because the caller will already have
 771    * subtracted gofs from pos, and gofs is the offset from the
 772    * \G to the start of the regex. For example, in /.abc\Gdef/,
 773    * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
 774    * caller will have set strpos=pos()-4; we look for the substr
 775    * at position pos()-4+1, which lines up with the "a" */
 776
 777    if (prog->check_offset_min == prog->check_offset_max) {
 778     /* Substring at constant offset from beg-of-str... */
 779     SSize_t slen = SvCUR(check);
 780     char *s = HOP3c(strpos, prog->check_offset_min, strend);
 781
 782     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 783      "  Looking for check substr at fixed offset %"IVdf"...\n",
 784      (IV)prog->check_offset_min));
 785
 786     if (SvTAIL(check)) {
 787      /* In this case, the regex is anchored at the end too.
 788      * Unless it's a multiline match, the lengths must match
 789      * exactly, give or take a \n.  NB: slen >= 1 since
 790      * the last char of check is \n */
 791      if (!multiline
 792       && (   strend - s > slen
 793        || strend - s < slen - 1
 794        || (strend - s == slen && strend[-1] != '\n')))
 795      {
 796       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 797            "  String too long...\n"));
 798       goto fail_finish;
 799      }
 800      /* Now should match s[0..slen-2] */
 801      slen--;
 802     }
 803     if (slen && (*SvPVX_const(check) != *s
 804      || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
 805     {
 806      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 807          "  String not equal...\n"));
 808      goto fail_finish;
 809     }
 810
 811     check_at = s;
 812     goto success_at_start;
 813    }
 814   }
 815  }
 816
 817  end_shift = prog->check_end_shift;
 818
 819 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
 820  if (end_shift < 0)
 821   Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 822     (IV)end_shift, RX_PRECOMP(prog));
 823 #endif
 824
 825   restart:
 826
 827  /* This is the (re)entry point of the main loop in this function.
 828  * The goal of this loop is to:
 829  * 1) find the "check" substring in the region rx_origin..strend
 830  *    (adjusted by start_shift / end_shift). If not found, reject
 831  *    immediately.
 832  * 2) If it exists, look for the "other" substr too if defined; for
 833  *    example, if the check substr maps to the anchored substr, then
 834  *    check the floating substr, and vice-versa. If not found, go
 835  *    back to (1) with rx_origin suitably incremented.
 836  * 3) If we find an rx_origin position that doesn't contradict
 837  *    either of the substrings, then check the possible additional
 838  *    constraints on rx_origin of /^.../m or a known start class.
 839  *    If these fail, then depending on which constraints fail, jump
 840  *    back to here, or to various other re-entry points further along
 841  *    that skip some of the first steps.
 842  * 4) If we pass all those tests, update the BmUSEFUL() count on the
 843  *    substring. If the start position was determined to be at the
 844  *    beginning of the string  - so, not rejected, but not optimised,
 845  *    since we have to run regmatch from position 0 - decrement the
 846  *    BmUSEFUL() count. Otherwise increment it.
 847  */
 848
 849
 850  /* first, look for the 'check' substring */
 851
 852  {
 853   U8* start_point;
 854   U8* end_point;
 855
 856   DEBUG_OPTIMISE_MORE_r({
 857    PerlIO_printf(Perl_debug_log,
 858     "  At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
 859     " Start shift: %"IVdf" End shift %"IVdf
 860     " Real end Shift: %"IVdf"\n",
 861     (IV)(rx_origin - strbeg),
 862     (IV)prog->check_offset_min,
 863     (IV)start_shift,
 864     (IV)end_shift,
 865     (IV)prog->check_end_shift);
 866   });
 867
 868   end_point = HOP3(strend, -end_shift, strbeg);
 869   start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
 870   if (!start_point)
 871    goto fail_finish;
 872
 873
 874   /* If the regex is absolutely anchored to either the start of the
 875   * string (SBOL) or to pos() (ANCH_GPOS), then
 876   * check_offset_max represents an upper bound on the string where
 877   * the substr could start. For the ANCH_GPOS case, we assume that
 878   * the caller of intuit will have already set strpos to
 879   * pos()-gofs, so in this case strpos + offset_max will still be
 880   * an upper bound on the substr.
 881   */
 882   if (!ml_anch
 883    && prog->intflags & PREGf_ANCH
 884    && prog->check_offset_max != SSize_t_MAX)
 885   {
 886    SSize_t len = SvCUR(check) - !!SvTAIL(check);
 887    const char * const anchor =
 888       (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
 889
 890    /* do a bytes rather than chars comparison. It's conservative;
 891    * so it skips doing the HOP if the result can't possibly end
 892    * up earlier than the old value of end_point.
 893    */
 894    if ((char*)end_point - anchor > prog->check_offset_max) {
 895     end_point = HOP3lim((U8*)anchor,
 896         prog->check_offset_max,
 897         end_point -len)
 898        + len;
 899    }
 900   }
 901
 902   check_at = fbm_instr( start_point, end_point,
 903      check, multiline ? FBMrf_MULTILINE : 0);
 904
 905   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 906    "  doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
 907    (IV)((char*)start_point - strbeg),
 908    (IV)((char*)end_point   - strbeg),
 909    (IV)(check_at ? check_at - strbeg : -1)
 910   ));
 911
 912   /* Update the count-of-usability, remove useless subpatterns,
 913    unshift s.  */
 914
 915   DEBUG_EXECUTE_r({
 916    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
 917     SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 918    PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s%s",
 919        (check_at ? "Found" : "Did not find"),
 920     (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
 921      ? "anchored" : "floating"),
 922     quoted,
 923     RE_SV_TAIL(check),
 924     (check_at ? " at offset " : "...\n") );
 925   });
 926
 927   if (!check_at)
 928    goto fail_finish;
 929   /* set rx_origin to the minimum position where the regex could start
 930   * matching, given the constraint of the just-matched check substring.
 931   * But don't set it lower than previously.
 932   */
 933
 934   if (check_at - rx_origin > prog->check_offset_max)
 935    rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
 936   /* Finish the diagnostic message */
 937   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 938    "%ld (rx_origin now %"IVdf")...\n",
 939    (long)(check_at - strbeg),
 940    (IV)(rx_origin - strbeg)
 941   ));
 942  }
 943
 944
 945  /* now look for the 'other' substring if defined */
 946
 947  if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
 948      : prog->substrs->data[other_ix].substr)
 949  {
 950   /* Take into account the "other" substring. */
 951   char *last, *last1;
 952   char *s;
 953   SV* must;
 954   struct reg_substr_datum *other;
 955
 956  do_other_substr:
 957   other = &prog->substrs->data[other_ix];
 958
 959   /* if "other" is anchored:
 960   * we've previously found a floating substr starting at check_at.
 961   * This means that the regex origin must lie somewhere
 962   * between min (rx_origin): HOP3(check_at, -check_offset_max)
 963   * and max:                 HOP3(check_at, -check_offset_min)
 964   * (except that min will be >= strpos)
 965   * So the fixed  substr must lie somewhere between
 966   *  HOP3(min, anchored_offset)
 967   *  HOP3(max, anchored_offset) + SvCUR(substr)
 968   */
 969
 970   /* if "other" is floating
 971   * Calculate last1, the absolute latest point where the
 972   * floating substr could start in the string, ignoring any
 973   * constraints from the earlier fixed match. It is calculated
 974   * as follows:
 975   *
 976   * strend - prog->minlen (in chars) is the absolute latest
 977   * position within the string where the origin of the regex
 978   * could appear. The latest start point for the floating
 979   * substr is float_min_offset(*) on from the start of the
 980   * regex.  last1 simply combines thee two offsets.
 981   *
 982   * (*) You might think the latest start point should be
 983   * float_max_offset from the regex origin, and technically
 984   * you'd be correct. However, consider
 985   *    /a\d{2,4}bcd\w/
 986   * Here, float min, max are 3,5 and minlen is 7.
 987   * This can match either
 988   *    /a\d\dbcd\w/
 989   *    /a\d\d\dbcd\w/
 990   *    /a\d\d\d\dbcd\w/
 991   * In the first case, the regex matches minlen chars; in the
 992   * second, minlen+1, in the third, minlen+2.
 993   * In the first case, the floating offset is 3 (which equals
 994   * float_min), in the second, 4, and in the third, 5 (which
 995   * equals float_max). In all cases, the floating string bcd
 996   * can never start more than 4 chars from the end of the
 997   * string, which equals minlen - float_min. As the substring
 998   * starts to match more than float_min from the start of the
 999   * regex, it makes the regex match more than minlen chars,
1000   * and the two cancel each other out. So we can always use
1001   * float_min - minlen, rather than float_max - minlen for the
1002   * latest position in the string.
1003   *
1004   * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1005   * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1006   */
1007
1008   assert(prog->minlen >= other->min_offset);
1009   last1 = HOP3c(strend,
1010       other->min_offset - prog->minlen, strbeg);
1011
1012   if (other_ix) {/* i.e. if (other-is-float) */
1013    /* last is the latest point where the floating substr could
1014    * start, *given* any constraints from the earlier fixed
1015    * match. This constraint is that the floating string starts
1016    * <= float_max_offset chars from the regex origin (rx_origin).
1017    * If this value is less than last1, use it instead.
1018    */
1019    assert(rx_origin <= last1);
1020    last =
1021     /* this condition handles the offset==infinity case, and
1022     * is a short-cut otherwise. Although it's comparing a
1023     * byte offset to a char length, it does so in a safe way,
1024     * since 1 char always occupies 1 or more bytes,
1025     * so if a string range is  (last1 - rx_origin) bytes,
1026     * it will be less than or equal to  (last1 - rx_origin)
1027     * chars; meaning it errs towards doing the accurate HOP3
1028     * rather than just using last1 as a short-cut */
1029     (last1 - rx_origin) < other->max_offset
1030      ? last1
1031      : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1032   }
1033   else {
1034    assert(strpos + start_shift <= check_at);
1035    last = HOP4c(check_at, other->min_offset - start_shift,
1036       strbeg, strend);
1037   }
1038
1039   s = HOP3c(rx_origin, other->min_offset, strend);
1040   if (s < other_last) /* These positions already checked */
1041    s = other_last;
1042
1043   must = utf8_target ? other->utf8_substr : other->substr;
1044   assert(SvPOK(must));
1045   {
1046    char *from = s;
1047    char *to   = last + SvCUR(must) - (SvTAIL(must)!=0);
1048
1049    if (from > to) {
1050     s = NULL;
1051     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1052      "  skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1053      (IV)(from - strbeg),
1054      (IV)(to   - strbeg)
1055     ));
1056    }
1057    else {
1058     s = fbm_instr(
1059      (unsigned char*)from,
1060      (unsigned char*)to,
1061      must,
1062      multiline ? FBMrf_MULTILINE : 0
1063     );
1064     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1065      "  doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1066      (IV)(from - strbeg),
1067      (IV)(to   - strbeg),
1068      (IV)(s ? s - strbeg : -1)
1069     ));
1070    }
1071   }
1072
1073   DEBUG_EXECUTE_r({
1074    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1075     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1076    PerlIO_printf(Perl_debug_log, "  %s %s substr %s%s",
1077     s ? "Found" : "Contradicts",
1078     other_ix ? "floating" : "anchored",
1079     quoted, RE_SV_TAIL(must));
1080   });
1081
1082
1083   if (!s) {
1084    /* last1 is latest possible substr location. If we didn't
1085    * find it before there, we never will */
1086    if (last >= last1) {
1087     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1088           "; giving up...\n"));
1089     goto fail_finish;
1090    }
1091
1092    /* try to find the check substr again at a later
1093    * position. Maybe next time we'll find the "other" substr
1094    * in range too */
1095    other_last = HOP3c(last, 1, strend) /* highest failure */;
1096    rx_origin =
1097     other_ix /* i.e. if other-is-float */
1098      ? HOP3c(rx_origin, 1, strend)
1099      : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1100    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1101     "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1102     (other_ix ? "floating" : "anchored"),
1103     (long)(HOP3c(check_at, 1, strend) - strbeg),
1104     (IV)(rx_origin - strbeg)
1105    ));
1106    goto restart;
1107   }
1108   else {
1109    if (other_ix) { /* if (other-is-float) */
1110     /* other_last is set to s, not s+1, since its possible for
1111     * a floating substr to fail first time, then succeed
1112     * second time at the same floating position; e.g.:
1113     *     "-AB--AABZ" =~ /\wAB\d*Z/
1114     * The first time round, anchored and float match at
1115     * "-(AB)--AAB(Z)" then fail on the initial \w character
1116     * class. Second time round, they match at "-AB--A(AB)(Z)".
1117     */
1118     other_last = s;
1119    }
1120    else {
1121     rx_origin = HOP3c(s, -other->min_offset, strbeg);
1122     other_last = HOP3c(s, 1, strend);
1123    }
1124    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1125     " at offset %ld (rx_origin now %"IVdf")...\n",
1126     (long)(s - strbeg),
1127     (IV)(rx_origin - strbeg)
1128    ));
1129
1130   }
1131  }
1132  else {
1133   DEBUG_OPTIMISE_MORE_r(
1134    PerlIO_printf(Perl_debug_log,
1135     "  Check-only match: offset min:%"IVdf" max:%"IVdf
1136     " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1137     " strend:%"IVdf"\n",
1138     (IV)prog->check_offset_min,
1139     (IV)prog->check_offset_max,
1140     (IV)(check_at-strbeg),
1141     (IV)(rx_origin-strbeg),
1142     (IV)(rx_origin-check_at),
1143     (IV)(strend-strbeg)
1144    )
1145   );
1146  }
1147
1148   postprocess_substr_matches:
1149
1150  /* handle the extra constraint of /^.../m if present */
1151
1152  if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1153   char *s;
1154
1155   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1156       "  looking for /^/m anchor"));
1157
1158   /* we have failed the constraint of a \n before rx_origin.
1159   * Find the next \n, if any, even if it's beyond the current
1160   * anchored and/or floating substrings. Whether we should be
1161   * scanning ahead for the next \n or the next substr is debatable.
1162   * On the one hand you'd expect rare substrings to appear less
1163   * often than \n's. On the other hand, searching for \n means
1164   * we're effectively flipping between check_substr and "\n" on each
1165   * iteration as the current "rarest" string candidate, which
1166   * means for example that we'll quickly reject the whole string if
1167   * hasn't got a \n, rather than trying every substr position
1168   * first
1169   */
1170
1171   s = HOP3c(strend, - prog->minlen, strpos);
1172   if (s <= rx_origin ||
1173    ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1174   {
1175    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1176        "  Did not find /%s^%s/m...\n",
1177        PL_colors[0], PL_colors[1]));
1178    goto fail_finish;
1179   }
1180
1181   /* earliest possible origin is 1 char after the \n.
1182   * (since *rx_origin == '\n', it's safe to ++ here rather than
1183   * HOP(rx_origin, 1)) */
1184   rx_origin++;
1185
1186   if (prog->substrs->check_ix == 0  /* check is anchored */
1187    || rx_origin >= HOP3c(check_at,  - prog->check_offset_min, strpos))
1188   {
1189    /* Position contradicts check-string; either because
1190    * check was anchored (and thus has no wiggle room),
1191    * or check was float and rx_origin is above the float range */
1192    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1193     "  Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1194     PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1195    goto restart;
1196   }
1197
1198   /* if we get here, the check substr must have been float,
1199   * is in range, and we may or may not have had an anchored
1200   * "other" substr which still contradicts */
1201   assert(prog->substrs->check_ix); /* check is float */
1202
1203   if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1204    /* whoops, the anchored "other" substr exists, so we still
1205    * contradict. On the other hand, the float "check" substr
1206    * didn't contradict, so just retry the anchored "other"
1207    * substr */
1208    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1209     "  Found /%s^%s/m, rescanning for anchored from offset %ld (rx_origin now %"IVdf")...\n",
1210     PL_colors[0], PL_colors[1],
1211     (long)(rx_origin - strbeg + prog->anchored_offset),
1212     (long)(rx_origin - strbeg)
1213    ));
1214    goto do_other_substr;
1215   }
1216
1217   /* success: we don't contradict the found floating substring
1218   * (and there's no anchored substr). */
1219   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1220    "  Found /%s^%s/m with rx_origin %ld...\n",
1221    PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1222  }
1223  else {
1224   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1225    "  (multiline anchor test skipped)\n"));
1226  }
1227
1228   success_at_start:
1229
1230
1231  /* if we have a starting character class, then test that extra constraint.
1232  * (trie stclasses are too expensive to use here, we are better off to
1233  * leave it to regmatch itself) */
1234
1235  if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1236   const U8* const str = (U8*)STRING(progi->regstclass);
1237
1238   /* XXX this value could be pre-computed */
1239   const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1240      ?  (reginfo->is_utf8_pat
1241       ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1242       : STR_LEN(progi->regstclass))
1243      : 1);
1244   char * endpos;
1245   char *s;
1246   /* latest pos that a matching float substr constrains rx start to */
1247   char *rx_max_float = NULL;
1248
1249   /* if the current rx_origin is anchored, either by satisfying an
1250   * anchored substring constraint, or a /^.../m constraint, then we
1251   * can reject the current origin if the start class isn't found
1252   * at the current position. If we have a float-only match, then
1253   * rx_origin is constrained to a range; so look for the start class
1254   * in that range. if neither, then look for the start class in the
1255   * whole rest of the string */
1256
1257   /* XXX DAPM it's not clear what the minlen test is for, and why
1258   * it's not used in the floating case. Nothing in the test suite
1259   * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1260   * Here are some old comments, which may or may not be correct:
1261   *
1262   *   minlen == 0 is possible if regstclass is \b or \B,
1263   *   and the fixed substr is ''$.
1264   *   Since minlen is already taken into account, rx_origin+1 is
1265   *   before strend; accidentally, minlen >= 1 guaranties no false
1266   *   positives at rx_origin + 1 even for \b or \B.  But (minlen? 1 :
1267   *   0) below assumes that regstclass does not come from lookahead...
1268   *   If regstclass takes bytelength more than 1: If charlength==1, OK.
1269   *   This leaves EXACTF-ish only, which are dealt with in
1270   *   find_byclass().
1271   */
1272
1273   if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1274    endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1275   else if (prog->float_substr || prog->float_utf8) {
1276    rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1277    endpos= HOP3c(rx_max_float, cl_l, strend);
1278   }
1279   else
1280    endpos= strend;
1281
1282   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1283    "  looking for class: start_shift: %"IVdf" check_at: %"IVdf
1284    " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1285    (IV)start_shift, (IV)(check_at - strbeg),
1286    (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1287
1288   s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1289        reginfo);
1290   if (!s) {
1291    if (endpos == strend) {
1292     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1293         "  Could not match STCLASS...\n") );
1294     goto fail;
1295    }
1296    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1297        "  This position contradicts STCLASS...\n") );
1298    if ((prog->intflags & PREGf_ANCH) && !ml_anch
1299       && !(prog->intflags & PREGf_IMPLICIT))
1300     goto fail;
1301
1302    /* Contradict one of substrings */
1303    if (prog->anchored_substr || prog->anchored_utf8) {
1304     if (prog->substrs->check_ix == 1) { /* check is float */
1305      /* Have both, check_string is floating */
1306      assert(rx_origin + start_shift <= check_at);
1307      if (rx_origin + start_shift != check_at) {
1308       /* not at latest position float substr could match:
1309       * Recheck anchored substring, but not floating.
1310       * The condition above is in bytes rather than
1311       * chars for efficiency. It's conservative, in
1312       * that it errs on the side of doing 'goto
1313       * do_other_substr'. In this case, at worst,
1314       * an extra anchored search may get done, but in
1315       * practice the extra fbm_instr() is likely to
1316       * get skipped anyway. */
1317       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1318        "  about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1319        (long)(other_last - strbeg),
1320        (IV)(rx_origin - strbeg)
1321       ));
1322       goto do_other_substr;
1323      }
1324     }
1325    }
1326    else {
1327     /* float-only */
1328
1329     if (ml_anch) {
1330      /* In the presence of ml_anch, we might be able to
1331      * find another \n without breaking the current float
1332      * constraint. */
1333
1334      /* strictly speaking this should be HOP3c(..., 1, ...),
1335      * but since we goto a block of code that's going to
1336      * search for the next \n if any, its safe here */
1337      rx_origin++;
1338      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1339        "  about to look for /%s^%s/m starting at rx_origin %ld...\n",
1340        PL_colors[0], PL_colors[1],
1341        (long)(rx_origin - strbeg)) );
1342      goto postprocess_substr_matches;
1343     }
1344
1345     /* strictly speaking this can never be true; but might
1346     * be if we ever allow intuit without substrings */
1347     if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1348      goto fail;
1349
1350     rx_origin = rx_max_float;
1351    }
1352
1353    /* at this point, any matching substrings have been
1354    * contradicted. Start again... */
1355
1356    rx_origin = HOP3c(rx_origin, 1, strend);
1357
1358    /* uses bytes rather than char calculations for efficiency.
1359    * It's conservative: it errs on the side of doing 'goto restart',
1360    * where there is code that does a proper char-based test */
1361    if (rx_origin + start_shift + end_shift > strend) {
1362     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1363          "  Could not match STCLASS...\n") );
1364     goto fail;
1365    }
1366    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1367     "  about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
1368     (prog->substrs->check_ix ? "floating" : "anchored"),
1369     (long)(rx_origin + start_shift - strbeg),
1370     (IV)(rx_origin - strbeg)
1371    ));
1372    goto restart;
1373   }
1374
1375   /* Success !!! */
1376
1377   if (rx_origin != s) {
1378    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1379       "  By STCLASS: moving %ld --> %ld\n",
1380         (long)(rx_origin - strbeg), (long)(s - strbeg))
1381     );
1382   }
1383   else {
1384    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1385         "  Does not contradict STCLASS...\n");
1386     );
1387   }
1388  }
1389
1390  /* Decide whether using the substrings helped */
1391
1392  if (rx_origin != strpos) {
1393   /* Fixed substring is found far enough so that the match
1394   cannot start at strpos. */
1395
1396   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  try at offset...\n"));
1397   ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1398  }
1399  else {
1400   /* The found rx_origin position does not prohibit matching at
1401   * strpos, so calling intuit didn't gain us anything. Decrement
1402   * the BmUSEFUL() count on the check substring, and if we reach
1403   * zero, free it.  */
1404   if (!(prog->intflags & PREGf_NAUGHTY)
1405    && (utf8_target ? (
1406     prog->check_utf8  /* Could be deleted already */
1407     && --BmUSEFUL(prog->check_utf8) < 0
1408     && (prog->check_utf8 == prog->float_utf8)
1409    ) : (
1410     prog->check_substr  /* Could be deleted already */
1411     && --BmUSEFUL(prog->check_substr) < 0
1412     && (prog->check_substr == prog->float_substr)
1413    )))
1414   {
1415    /* If flags & SOMETHING - do not do it many times on the same match */
1416    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "  ... Disabling check substring...\n"));
1417    /* XXX Does the destruction order has to change with utf8_target? */
1418    SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1419    SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1420    prog->check_substr = prog->check_utf8 = NULL; /* disable */
1421    prog->float_substr = prog->float_utf8 = NULL; /* clear */
1422    check = NULL;   /* abort */
1423    /* XXXX This is a remnant of the old implementation.  It
1424      looks wasteful, since now INTUIT can use many
1425      other heuristics. */
1426    prog->extflags &= ~RXf_USE_INTUIT;
1427   }
1428  }
1429
1430  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1431    "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1432    PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1433
1434  return rx_origin;
1435
1436   fail_finish:    /* Substring not found */
1437  if (prog->check_substr || prog->check_utf8)  /* could be removed already */
1438   BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1439   fail:
1440  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1441       PL_colors[4], PL_colors[5]));
1442  return NULL;
1443 }
1444
1445
1446 #define DECL_TRIE_TYPE(scan) \
1447  const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold,       \
1448     trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold,              \
1449     trie_utf8l, trie_flu8 }                                            \
1450      trie_type = ((scan->flags == EXACT)                             \
1451         ? (utf8_target ? trie_utf8 : trie_plain)           \
1452         : (scan->flags == EXACTL)                          \
1453          ? (utf8_target ? trie_utf8l : trie_plain)       \
1454          : (scan->flags == EXACTFA)                      \
1455          ? (utf8_target                                \
1456           ? trie_utf8_exactfa_fold                   \
1457           : trie_latin_utf8_exactfa_fold)            \
1458          : (scan->flags == EXACTFLU8                   \
1459           ? trie_flu8                                \
1460           : (utf8_target                             \
1461           ? trie_utf8_fold                         \
1462           :   trie_latin_utf8_fold)))
1463
1464 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1465 STMT_START {                                                                        \
1466  STRLEN skiplen;                                                                 \
1467  U8 flags = FOLD_FLAGS_FULL;                                                     \
1468  switch (trie_type) {                                                            \
1469  case trie_flu8:                                                                 \
1470   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1471   if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1472    _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1473   }                                                                           \
1474   goto do_trie_utf8_fold;                                                     \
1475  case trie_utf8_exactfa_fold:                                                    \
1476   flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1477   /* FALLTHROUGH */                                                           \
1478  case trie_utf8_fold:                                                            \
1479  do_trie_utf8_fold:                                                            \
1480   if ( foldlen>0 ) {                                                          \
1481    uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1482    foldlen -= len;                                                         \
1483    uscan += len;                                                           \
1484    len=0;                                                                  \
1485   } else {                                                                    \
1486    uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
1487    len = UTF8SKIP(uc);                                                     \
1488    skiplen = UNISKIP( uvc );                                               \
1489    foldlen -= skiplen;                                                     \
1490    uscan = foldbuf + skiplen;                                              \
1491   }                                                                           \
1492   break;                                                                      \
1493  case trie_latin_utf8_exactfa_fold:                                              \
1494   flags |= FOLD_FLAGS_NOMIX_ASCII;                                            \
1495   /* FALLTHROUGH */                                                           \
1496  case trie_latin_utf8_fold:                                                      \
1497   if ( foldlen>0 ) {                                                          \
1498    uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1499    foldlen -= len;                                                         \
1500    uscan += len;                                                           \
1501    len=0;                                                                  \
1502   } else {                                                                    \
1503    len = 1;                                                                \
1504    uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
1505    skiplen = UNISKIP( uvc );                                               \
1506    foldlen -= skiplen;                                                     \
1507    uscan = foldbuf + skiplen;                                              \
1508   }                                                                           \
1509   break;                                                                      \
1510  case trie_utf8l:                                                                \
1511   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;                                         \
1512   if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) {                             \
1513    _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc));          \
1514   }                                                                           \
1515   /* FALLTHROUGH */                                                           \
1516  case trie_utf8:                                                                 \
1517   uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );        \
1518   break;                                                                      \
1519  case trie_plain:                                                                \
1520   uvc = (UV)*uc;                                                              \
1521   len = 1;                                                                    \
1522  }                                                                               \
1523  if (uvc < 256) {                                                                \
1524   charid = trie->charmap[ uvc ];                                              \
1525  }                                                                               \
1526  else {                                                                          \
1527   charid = 0;                                                                 \
1528   if (widecharmap) {                                                          \
1529    SV** const svpp = hv_fetch(widecharmap,                                 \
1530       (char*)&uvc, sizeof(UV), 0);                                \
1531    if (svpp)                                                               \
1532     charid = (U16)SvIV(*svpp);                                          \
1533   }                                                                           \
1534  }                                                                               \
1535 } STMT_END
1536
1537 #define DUMP_EXEC_POS(li,s,doutf8)                          \
1538  dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1539     startpos, doutf8)
1540
1541 #define REXEC_FBC_EXACTISH_SCAN(COND)                     \
1542 STMT_START {                                              \
1543  while (s <= e) {                                      \
1544   if ( (COND)                                       \
1545    && (ln == 1 || folder(s, pat_string, ln))    \
1546    && (reginfo->intuit || regtry(reginfo, &s)) )\
1547    goto got_it;                                  \
1548   s++;                                              \
1549  }                                                     \
1550 } STMT_END
1551
1552 #define REXEC_FBC_UTF8_SCAN(CODE)                     \
1553 STMT_START {                                          \
1554  while (s < strend) {                              \
1555   CODE                                          \
1556   s += UTF8SKIP(s);                             \
1557  }                                                 \
1558 } STMT_END
1559
1560 #define REXEC_FBC_SCAN(CODE)                          \
1561 STMT_START {                                          \
1562  while (s < strend) {                              \
1563   CODE                                          \
1564   s++;                                          \
1565  }                                                 \
1566 } STMT_END
1567
1568 #define REXEC_FBC_UTF8_CLASS_SCAN(COND)                        \
1569 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */            \
1570  if (COND) {                                                \
1571   if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1572    goto got_it;                                       \
1573   else                                                   \
1574    tmp = doevery;                                     \
1575  }                                                          \
1576  else                                                       \
1577   tmp = 1;                                               \
1578 )
1579
1580 #define REXEC_FBC_CLASS_SCAN(COND)                             \
1581 REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
1582  if (COND) {                                                \
1583   if (tmp && (reginfo->intuit || regtry(reginfo, &s)))   \
1584    goto got_it;                                       \
1585   else                                                   \
1586    tmp = doevery;                                     \
1587  }                                                          \
1588  else                                                       \
1589   tmp = 1;                                               \
1590 )
1591
1592 #define REXEC_FBC_CSCAN(CONDUTF8,COND)                         \
1593  if (utf8_target) {                                         \
1594   REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8);                   \
1595  }                                                          \
1596  else {                                                     \
1597   REXEC_FBC_CLASS_SCAN(COND);                            \
1598  }
1599
1600 /* The three macros below are slightly different versions of the same logic.
1601  *
1602  * The first is for /a and /aa when the target string is UTF-8.  This can only
1603  * match ascii, but it must advance based on UTF-8.   The other two handle the
1604  * non-UTF-8 and the more generic UTF-8 cases.   In all three, we are looking
1605  * for the boundary (or non-boundary) between a word and non-word character.
1606  * The utf8 and non-utf8 cases have the same logic, but the details must be
1607  * different.  Find the "wordness" of the character just prior to this one, and
1608  * compare it with the wordness of this one.  If they differ, we have a
1609  * boundary.  At the beginning of the string, pretend that the previous
1610  * character was a new-line.
1611  *
1612  * All these macros uncleanly have side-effects with each other and outside
1613  * variables.  So far it's been too much trouble to clean-up
1614  *
1615  * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1616  *               a word character or not.
1617  * IF_SUCCESS    is code to do if it finds that we are at a boundary between
1618  *               word/non-word
1619  * IF_FAIL       is code to do if we aren't at a boundary between word/non-word
1620  *
1621  * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1622  * are looking for a boundary or for a non-boundary.  If we are looking for a
1623  * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1624  * see if this tentative match actually works, and if so, to quit the loop
1625  * here.  And vice-versa if we are looking for a non-boundary.
1626  *
1627  * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1628  * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1629  * TEST_NON_UTF8(s-1).  To see this, note that that's what it is defined to be
1630  * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1631  * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1632  * complement.  But in that branch we complement tmp, meaning that at the
1633  * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1634  * which means at the top of the loop in the next iteration, it is
1635  * TEST_NON_UTF8(s-1) */
1636 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)                         \
1637  tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                      \
1638  tmp = TEST_NON_UTF8(tmp);                                                  \
1639  REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1640   if (tmp == ! TEST_NON_UTF8((U8) *s)) {                                 \
1641    tmp = !tmp;                                                        \
1642    IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */     \
1643   }                                                                      \
1644   else {                                                                 \
1645    IF_FAIL;                                                           \
1646   }                                                                      \
1647  );                                                                         \
1648
1649 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1650  * TEST_UTF8 is a macro that for the same input code points returns identically
1651  * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1652 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL)                      \
1653  if (s == reginfo->strbeg) {                                                \
1654   tmp = '\n';                                                            \
1655  }                                                                          \
1656  else { /* Back-up to the start of the previous character */                \
1657   U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg);              \
1658   tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,                     \
1659              0, UTF8_ALLOW_DEFAULT); \
1660  }                                                                          \
1661  tmp = TEST_UV(tmp);                                                        \
1662  LOAD_UTF8_CHARCLASS_ALNUM();                                               \
1663  REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
1664   if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
1665    tmp = !tmp;                                                        \
1666    IF_SUCCESS;                                                        \
1667   }                                                                      \
1668   else {                                                                 \
1669    IF_FAIL;                                                           \
1670   }                                                                      \
1671  );
1672
1673 /* Like the above two macros.  UTF8_CODE is the complete code for handling
1674  * UTF-8.  Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1675  * macros below */
1676 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL)        \
1677  if (utf8_target) {                                                         \
1678   UTF8_CODE                                                              \
1679  }                                                                          \
1680  else {  /* Not utf8 */                                                     \
1681   tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n';                  \
1682   tmp = TEST_NON_UTF8(tmp);                                              \
1683   REXEC_FBC_SCAN( /* advances s while s < strend */                      \
1684    if (tmp == ! TEST_NON_UTF8((U8) *s)) {                             \
1685     IF_SUCCESS;                                                    \
1686     tmp = !tmp;                                                    \
1687    }                                                                  \
1688    else {                                                             \
1689     IF_FAIL;                                                       \
1690    }                                                                  \
1691   );                                                                     \
1692  }                                                                          \
1693  /* Here, things have been set up by the previous code so that tmp is the   \
1694  * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the         \
1695  * utf8ness of the target).  We also have to check if this matches against \
1696  * the EOS, which we treat as a \n (which is the same value in both UTF-8  \
1697  * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8    \
1698  * string */                                                               \
1699  if (tmp == ! TEST_NON_UTF8('\n')) {                                        \
1700   IF_SUCCESS;                                                            \
1701  }                                                                          \
1702  else {                                                                     \
1703   IF_FAIL;                                                               \
1704  }
1705
1706 /* This is the macro to use when we want to see if something that looks like it
1707  * could match, actually does, and if so exits the loop */
1708 #define REXEC_FBC_TRYIT                            \
1709  if ((reginfo->intuit || regtry(reginfo, &s)))  \
1710   goto got_it
1711
1712 /* The only difference between the BOUND and NBOUND cases is that
1713  * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1714  * NBOUND.  This is accomplished by passing it as either the if or else clause,
1715  * with the other one being empty (PLACEHOLDER is defined as empty).
1716  *
1717  * The TEST_FOO parameters are for operating on different forms of input, but
1718  * all should be ones that return identically for the same underlying code
1719  * points */
1720 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                           \
1721  FBC_BOUND_COMMON(                                                          \
1722   FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),          \
1723   TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1724
1725 #define FBC_BOUND_A(TEST_NON_UTF8)                                             \
1726  FBC_BOUND_COMMON(                                                          \
1727    FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),           \
1728    TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1729
1730 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8)                          \
1731  FBC_BOUND_COMMON(                                                          \
1732   FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),          \
1733   TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1734
1735 #define FBC_NBOUND_A(TEST_NON_UTF8)                                            \
1736  FBC_BOUND_COMMON(                                                          \
1737    FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT),           \
1738    TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1739
1740 /* Takes a pointer to an inversion list, a pointer to its corresponding
1741  * inversion map, and a code point, and returns the code point's value
1742  * according to the two arrays.  It assumes that all code points have a value.
1743  * This is used as the base macro for macros for particular properties */
1744 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp)              \
1745        invmap[_invlist_search(invlist, cp)]
1746
1747 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1748  * of a code point, returning the value for the first code point in the string.
1749  * And it takes the particular macro name that finds the desired value given a
1750  * code point.  Merely convert the UTF-8 to code point and call the cp macro */
1751 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend)                     \
1752    (__ASSERT_(pos < strend)                                          \
1753     /* Note assumes is valid UTF-8 */                             \
1754    (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1755
1756 /* Returns the GCB value for the input code point */
1757 #define getGCB_VAL_CP(cp)                                                      \
1758   _generic_GET_BREAK_VAL_CP(                                           \
1759          PL_GCB_invlist,                            \
1760          Grapheme_Cluster_Break_invmap,             \
1761          (cp))
1762
1763 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1764  * bounded by pos and strend */
1765 #define getGCB_VAL_UTF8(pos, strend)                                           \
1766  _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1767
1768
1769 /* Returns the SB value for the input code point */
1770 #define getSB_VAL_CP(cp)                                                       \
1771   _generic_GET_BREAK_VAL_CP(                                           \
1772          PL_SB_invlist,                             \
1773          Sentence_Break_invmap,                     \
1774          (cp))
1775
1776 /* Returns the SB value for the first code point in the UTF-8 encoded string
1777  * bounded by pos and strend */
1778 #define getSB_VAL_UTF8(pos, strend)                                            \
1779  _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1780
1781 /* Returns the WB value for the input code point */
1782 #define getWB_VAL_CP(cp)                                                       \
1783   _generic_GET_BREAK_VAL_CP(                                           \
1784          PL_WB_invlist,                             \
1785          Word_Break_invmap,                         \
1786          (cp))
1787
1788 /* Returns the WB value for the first code point in the UTF-8 encoded string
1789  * bounded by pos and strend */
1790 #define getWB_VAL_UTF8(pos, strend)                                            \
1791  _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1792
1793 /* We know what class REx starts with.  Try to find this position... */
1794 /* if reginfo->intuit, its a dryrun */
1795 /* annoyingly all the vars in this routine have different names from their counterparts
1796    in regmatch. /grrr */
1797 STATIC char *
1798 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1799  const char *strend, regmatch_info *reginfo)
1800 {
1801  dVAR;
1802  const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1803  char *pat_string;   /* The pattern's exactish string */
1804  char *pat_end;     /* ptr to end char of pat_string */
1805  re_fold_t folder; /* Function for computing non-utf8 folds */
1806  const U8 *fold_array;   /* array for folding ords < 256 */
1807  STRLEN ln;
1808  STRLEN lnc;
1809  U8 c1;
1810  U8 c2;
1811  char *e;
1812  I32 tmp = 1; /* Scratch variable? */
1813  const bool utf8_target = reginfo->is_utf8_target;
1814  UV utf8_fold_flags = 0;
1815  const bool is_utf8_pat = reginfo->is_utf8_pat;
1816  bool to_complement = FALSE; /* Invert the result?  Taking the xor of this
1817         with a result inverts that result, as 0^1 =
1818         1 and 1^1 = 0 */
1819  _char_class_number classnum;
1820
1821  RXi_GET_DECL(prog,progi);
1822
1823  PERL_ARGS_ASSERT_FIND_BYCLASS;
1824
1825  /* We know what class it must start with. */
1826  switch (OP(c)) {
1827  case ANYOFL:
1828   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1829   /* FALLTHROUGH */
1830  case ANYOF:
1831   if (utf8_target) {
1832    REXEC_FBC_UTF8_CLASS_SCAN(
1833      reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1834   }
1835   else {
1836    REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1837   }
1838   break;
1839
1840  case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
1841   assert(! is_utf8_pat);
1842   /* FALLTHROUGH */
1843  case EXACTFA:
1844   if (is_utf8_pat || utf8_target) {
1845    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1846    goto do_exactf_utf8;
1847   }
1848   fold_array = PL_fold_latin1;    /* Latin1 folds are not affected by */
1849   folder = foldEQ_latin1;         /* /a, except the sharp s one which */
1850   goto do_exactf_non_utf8; /* isn't dealt with by these */
1851
1852  case EXACTF:   /* This node only generated for non-utf8 patterns */
1853   assert(! is_utf8_pat);
1854   if (utf8_target) {
1855    utf8_fold_flags = 0;
1856    goto do_exactf_utf8;
1857   }
1858   fold_array = PL_fold;
1859   folder = foldEQ;
1860   goto do_exactf_non_utf8;
1861
1862  case EXACTFL:
1863   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1864   if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1865    utf8_fold_flags = FOLDEQ_LOCALE;
1866    goto do_exactf_utf8;
1867   }
1868   fold_array = PL_fold_locale;
1869   folder = foldEQ_locale;
1870   goto do_exactf_non_utf8;
1871
1872  case EXACTFU_SS:
1873   if (is_utf8_pat) {
1874    utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1875   }
1876   goto do_exactf_utf8;
1877
1878  case EXACTFLU8:
1879    if (! utf8_target) {    /* All code points in this node require
1880          UTF-8 to express.  */
1881     break;
1882    }
1883    utf8_fold_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1884            | FOLDEQ_S2_FOLDS_SANE;
1885    goto do_exactf_utf8;
1886
1887  case EXACTFU:
1888   if (is_utf8_pat || utf8_target) {
1889    utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1890    goto do_exactf_utf8;
1891   }
1892
1893   /* Any 'ss' in the pattern should have been replaced by regcomp,
1894   * so we don't have to worry here about this single special case
1895   * in the Latin1 range */
1896   fold_array = PL_fold_latin1;
1897   folder = foldEQ_latin1;
1898
1899   /* FALLTHROUGH */
1900
1901  do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1902       are no glitches with fold-length differences
1903       between the target string and pattern */
1904
1905   /* The idea in the non-utf8 EXACTF* cases is to first find the
1906   * first character of the EXACTF* node and then, if necessary,
1907   * case-insensitively compare the full text of the node.  c1 is the
1908   * first character.  c2 is its fold.  This logic will not work for
1909   * Unicode semantics and the german sharp ss, which hence should
1910   * not be compiled into a node that gets here. */
1911   pat_string = STRING(c);
1912   ln  = STR_LEN(c); /* length to match in octets/bytes */
1913
1914   /* We know that we have to match at least 'ln' bytes (which is the
1915   * same as characters, since not utf8).  If we have to match 3
1916   * characters, and there are only 2 availabe, we know without
1917   * trying that it will fail; so don't start a match past the
1918   * required minimum number from the far end */
1919   e = HOP3c(strend, -((SSize_t)ln), s);
1920
1921   if (reginfo->intuit && e < s) {
1922    e = s;   /* Due to minlen logic of intuit() */
1923   }
1924
1925   c1 = *pat_string;
1926   c2 = fold_array[c1];
1927   if (c1 == c2) { /* If char and fold are the same */
1928    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1929   }
1930   else {
1931    REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1932   }
1933   break;
1934
1935  do_exactf_utf8:
1936  {
1937   unsigned expansion;
1938
1939   /* If one of the operands is in utf8, we can't use the simpler folding
1940   * above, due to the fact that many different characters can have the
1941   * same fold, or portion of a fold, or different- length fold */
1942   pat_string = STRING(c);
1943   ln  = STR_LEN(c); /* length to match in octets/bytes */
1944   pat_end = pat_string + ln;
1945   lnc = is_utf8_pat       /* length to match in characters */
1946     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1947     : ln;
1948
1949   /* We have 'lnc' characters to match in the pattern, but because of
1950   * multi-character folding, each character in the target can match
1951   * up to 3 characters (Unicode guarantees it will never exceed
1952   * this) if it is utf8-encoded; and up to 2 if not (based on the
1953   * fact that the Latin 1 folds are already determined, and the
1954   * only multi-char fold in that range is the sharp-s folding to
1955   * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
1956   * string character.  Adjust lnc accordingly, rounding up, so that
1957   * if we need to match at least 4+1/3 chars, that really is 5. */
1958   expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1959   lnc = (lnc + expansion - 1) / expansion;
1960
1961   /* As in the non-UTF8 case, if we have to match 3 characters, and
1962   * only 2 are left, it's guaranteed to fail, so don't start a
1963   * match that would require us to go beyond the end of the string
1964   */
1965   e = HOP3c(strend, -((SSize_t)lnc), s);
1966
1967   if (reginfo->intuit && e < s) {
1968    e = s;   /* Due to minlen logic of intuit() */
1969   }
1970
1971   /* XXX Note that we could recalculate e to stop the loop earlier,
1972   * as the worst case expansion above will rarely be met, and as we
1973   * go along we would usually find that e moves further to the left.
1974   * This would happen only after we reached the point in the loop
1975   * where if there were no expansion we should fail.  Unclear if
1976   * worth the expense */
1977
1978   while (s <= e) {
1979    char *my_strend= (char *)strend;
1980    if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
1981     pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1982     && (reginfo->intuit || regtry(reginfo, &s)) )
1983    {
1984     goto got_it;
1985    }
1986    s += (utf8_target) ? UTF8SKIP(s) : 1;
1987   }
1988   break;
1989  }
1990
1991  case BOUNDL:
1992   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1993   if (FLAGS(c) != TRADITIONAL_BOUND) {
1994    if (! IN_UTF8_CTYPE_LOCALE) {
1995     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
1996             B_ON_NON_UTF8_LOCALE_IS_WRONG);
1997    }
1998    goto do_boundu;
1999   }
2000
2001   FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2002   break;
2003
2004  case NBOUNDL:
2005   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2006   if (FLAGS(c) != TRADITIONAL_BOUND) {
2007    if (! IN_UTF8_CTYPE_LOCALE) {
2008     Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2009             B_ON_NON_UTF8_LOCALE_IS_WRONG);
2010    }
2011    goto do_nboundu;
2012   }
2013
2014   FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2015   break;
2016
2017  case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2018     meaning */
2019   assert(FLAGS(c) == TRADITIONAL_BOUND);
2020
2021   FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2022   break;
2023
2024  case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2025     meaning */
2026   assert(FLAGS(c) == TRADITIONAL_BOUND);
2027
2028   FBC_BOUND_A(isWORDCHAR_A);
2029   break;
2030
2031  case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2032     meaning */
2033   assert(FLAGS(c) == TRADITIONAL_BOUND);
2034
2035   FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2036   break;
2037
2038  case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2039     meaning */
2040   assert(FLAGS(c) == TRADITIONAL_BOUND);
2041
2042   FBC_NBOUND_A(isWORDCHAR_A);
2043   break;
2044
2045  case NBOUNDU:
2046   if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2047    FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2048    break;
2049   }
2050
2051  do_nboundu:
2052
2053   to_complement = 1;
2054   /* FALLTHROUGH */
2055
2056  case BOUNDU:
2057  do_boundu:
2058   switch((bound_type) FLAGS(c)) {
2059    case TRADITIONAL_BOUND:
2060     FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2061     break;
2062    case GCB_BOUND:
2063     if (s == reginfo->strbeg) { /* GCB always matches at begin and
2064            end */
2065      if (to_complement ^ cBOOL(reginfo->intuit
2066              || regtry(reginfo, &s)))
2067      {
2068       goto got_it;
2069      }
2070      s += (utf8_target) ? UTF8SKIP(s) : 1;
2071     }
2072
2073     if (utf8_target) {
2074      GCB_enum before = getGCB_VAL_UTF8(
2075            reghop3((U8*)s, -1,
2076              (U8*)(reginfo->strbeg)),
2077            (U8*) reginfo->strend);
2078      while (s < strend) {
2079       GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2080               (U8*) reginfo->strend);
2081       if (to_complement ^ isGCB(before, after)) {
2082        if (reginfo->intuit || regtry(reginfo, &s)) {
2083         goto got_it;
2084        }
2085        before = after;
2086       }
2087       s += UTF8SKIP(s);
2088      }
2089     }
2090     else {  /* Not utf8.  Everything is a GCB except between CR and
2091       LF */
2092      while (s < strend) {
2093       if (to_complement ^ (UCHARAT(s - 1) != '\r'
2094            || UCHARAT(s) != '\n'))
2095       {
2096        if (reginfo->intuit || regtry(reginfo, &s)) {
2097         goto got_it;
2098        }
2099        s++;
2100       }
2101      }
2102     }
2103
2104     if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
2105      goto got_it;
2106     }
2107     break;
2108
2109    case SB_BOUND:
2110     if (s == reginfo->strbeg) { /* SB always matches at beginning */
2111      if (to_complement
2112         ^ cBOOL(reginfo->intuit || regtry(reginfo, &s)))
2113      {
2114       goto got_it;
2115      }
2116
2117      /* Didn't match.  Go try at the next position */
2118      s += (utf8_target) ? UTF8SKIP(s) : 1;
2119     }
2120
2121     if (utf8_target) {
2122      SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2123               -1,
2124               (U8*)(reginfo->strbeg)),
2125              (U8*) reginfo->strend);
2126      while (s < strend) {
2127       SB_enum after = getSB_VAL_UTF8((U8*) s,
2128               (U8*) reginfo->strend);
2129       if (to_complement ^ isSB(before,
2130             after,
2131             (U8*) reginfo->strbeg,
2132             (U8*) s,
2133             (U8*) reginfo->strend,
2134             utf8_target))
2135       {
2136        if (reginfo->intuit || regtry(reginfo, &s)) {
2137         goto got_it;
2138        }
2139        before = after;
2140       }
2141       s += UTF8SKIP(s);
2142      }
2143     }
2144     else {  /* Not utf8. */
2145      SB_enum before = getSB_VAL_CP((U8) *(s -1));
2146      while (s < strend) {
2147       SB_enum after = getSB_VAL_CP((U8) *s);
2148       if (to_complement ^ isSB(before,
2149             after,
2150             (U8*) reginfo->strbeg,
2151             (U8*) s,
2152             (U8*) reginfo->strend,
2153             utf8_target))
2154       {
2155        if (reginfo->intuit || regtry(reginfo, &s)) {
2156         goto got_it;
2157        }
2158        before = after;
2159       }
2160       s++;
2161      }
2162     }
2163
2164     /* Here are at the final position in the target string.  The SB
2165     * value is always true here, so matches, depending on other
2166     * constraints */
2167     if (to_complement ^ cBOOL(reginfo->intuit
2168              || regtry(reginfo, &s)))
2169     {
2170      goto got_it;
2171     }
2172
2173     break;
2174
2175    case WB_BOUND:
2176     if (s == reginfo->strbeg) {
2177      if (to_complement ^ cBOOL(reginfo->intuit
2178            || regtry(reginfo, &s)))
2179      {
2180       goto got_it;
2181      }
2182      s += (utf8_target) ? UTF8SKIP(s) : 1;
2183     }
2184
2185     if (utf8_target) {
2186      /* We are at a boundary between char_sub_0 and char_sub_1.
2187      * We also keep track of the value for char_sub_-1 as we
2188      * loop through the line.   Context may be needed to make a
2189      * determination, and if so, this can save having to
2190      * recalculate it */
2191      WB_enum previous = WB_UNKNOWN;
2192      WB_enum before = getWB_VAL_UTF8(
2193            reghop3((U8*)s,
2194              -1,
2195              (U8*)(reginfo->strbeg)),
2196            (U8*) reginfo->strend);
2197      while (s < strend) {
2198       WB_enum after = getWB_VAL_UTF8((U8*) s,
2199               (U8*) reginfo->strend);
2200       if (to_complement ^ isWB(previous,
2201             before,
2202             after,
2203             (U8*) reginfo->strbeg,
2204             (U8*) s,
2205             (U8*) reginfo->strend,
2206             utf8_target))
2207       {
2208        if (reginfo->intuit || regtry(reginfo, &s)) {
2209         goto got_it;
2210        }
2211        previous = before;
2212        before = after;
2213       }
2214       s += UTF8SKIP(s);
2215      }
2216     }
2217     else {  /* Not utf8. */
2218      WB_enum previous = WB_UNKNOWN;
2219      WB_enum before = getWB_VAL_CP((U8) *(s -1));
2220      while (s < strend) {
2221       WB_enum after = getWB_VAL_CP((U8) *s);
2222       if (to_complement ^ isWB(previous,
2223             before,
2224             after,
2225             (U8*) reginfo->strbeg,
2226             (U8*) s,
2227             (U8*) reginfo->strend,
2228             utf8_target))
2229       {
2230        if (reginfo->intuit || regtry(reginfo, &s)) {
2231         goto got_it;
2232        }
2233        previous = before;
2234        before = after;
2235       }
2236       s++;
2237      }
2238     }
2239
2240     if (to_complement ^ cBOOL(reginfo->intuit
2241           || regtry(reginfo, &s)))
2242     {
2243      goto got_it;
2244     }
2245
2246     break;
2247   }
2248   break;
2249
2250  case LNBREAK:
2251   REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2252       is_LNBREAK_latin1_safe(s, strend)
2253   );
2254   break;
2255
2256  /* The argument to all the POSIX node types is the class number to pass to
2257  * _generic_isCC() to build a mask for searching in PL_charclass[] */
2258
2259  case NPOSIXL:
2260   to_complement = 1;
2261   /* FALLTHROUGH */
2262
2263  case POSIXL:
2264   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2265   REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2266       to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2267   break;
2268
2269  case NPOSIXD:
2270   to_complement = 1;
2271   /* FALLTHROUGH */
2272
2273  case POSIXD:
2274   if (utf8_target) {
2275    goto posix_utf8;
2276   }
2277   goto posixa;
2278
2279  case NPOSIXA:
2280   if (utf8_target) {
2281    /* The complement of something that matches only ASCII matches all
2282    * non-ASCII, plus everything in ASCII that isn't in the class. */
2283    REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
2284          || ! _generic_isCC_A(*s, FLAGS(c)));
2285    break;
2286   }
2287
2288   to_complement = 1;
2289   /* FALLTHROUGH */
2290
2291  case POSIXA:
2292  posixa:
2293   /* Don't need to worry about utf8, as it can match only a single
2294   * byte invariant character. */
2295   REXEC_FBC_CLASS_SCAN(
2296       to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2297   break;
2298
2299  case NPOSIXU:
2300   to_complement = 1;
2301   /* FALLTHROUGH */
2302
2303  case POSIXU:
2304   if (! utf8_target) {
2305    REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2306                  FLAGS(c))));
2307   }
2308   else {
2309
2310   posix_utf8:
2311    classnum = (_char_class_number) FLAGS(c);
2312    if (classnum < _FIRST_NON_SWASH_CC) {
2313     while (s < strend) {
2314
2315      /* We avoid loading in the swash as long as possible, but
2316      * should we have to, we jump to a separate loop.  This
2317      * extra 'if' statement is what keeps this code from being
2318      * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2319      if (UTF8_IS_ABOVE_LATIN1(*s)) {
2320       goto found_above_latin1;
2321      }
2322      if ((UTF8_IS_INVARIANT(*s)
2323       && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2324                 classnum)))
2325       || (UTF8_IS_DOWNGRADEABLE_START(*s)
2326        && to_complement ^ cBOOL(
2327         _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
2328                  *(s + 1)),
2329            classnum))))
2330      {
2331       if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2332        goto got_it;
2333       else {
2334        tmp = doevery;
2335       }
2336      }
2337      else {
2338       tmp = 1;
2339      }
2340      s += UTF8SKIP(s);
2341     }
2342    }
2343    else switch (classnum) {    /* These classes are implemented as
2344           macros */
2345     case _CC_ENUM_SPACE:
2346      REXEC_FBC_UTF8_CLASS_SCAN(
2347           to_complement ^ cBOOL(isSPACE_utf8(s)));
2348      break;
2349
2350     case _CC_ENUM_BLANK:
2351      REXEC_FBC_UTF8_CLASS_SCAN(
2352           to_complement ^ cBOOL(isBLANK_utf8(s)));
2353      break;
2354
2355     case _CC_ENUM_XDIGIT:
2356      REXEC_FBC_UTF8_CLASS_SCAN(
2357          to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2358      break;
2359
2360     case _CC_ENUM_VERTSPACE:
2361      REXEC_FBC_UTF8_CLASS_SCAN(
2362          to_complement ^ cBOOL(isVERTWS_utf8(s)));
2363      break;
2364
2365     case _CC_ENUM_CNTRL:
2366      REXEC_FBC_UTF8_CLASS_SCAN(
2367           to_complement ^ cBOOL(isCNTRL_utf8(s)));
2368      break;
2369
2370     default:
2371      Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2372      NOT_REACHED; /* NOTREACHED */
2373    }
2374   }
2375   break;
2376
2377  found_above_latin1:   /* Here we have to load a swash to get the result
2378        for the current code point */
2379   if (! PL_utf8_swash_ptrs[classnum]) {
2380    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2381    PL_utf8_swash_ptrs[classnum] =
2382      _core_swash_init("utf8",
2383          "",
2384          &PL_sv_undef, 1, 0,
2385          PL_XPosix_ptrs[classnum], &flags);
2386   }
2387
2388   /* This is a copy of the loop above for swash classes, though using the
2389   * FBC macro instead of being expanded out.  Since we've loaded the
2390   * swash, we don't have to check for that each time through the loop */
2391   REXEC_FBC_UTF8_CLASS_SCAN(
2392     to_complement ^ cBOOL(_generic_utf8(
2393          classnum,
2394          s,
2395          swash_fetch(PL_utf8_swash_ptrs[classnum],
2396             (U8 *) s, TRUE))));
2397   break;
2398
2399  case AHOCORASICKC:
2400  case AHOCORASICK:
2401   {
2402    DECL_TRIE_TYPE(c);
2403    /* what trie are we using right now */
2404    reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2405    reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2406    HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2407
2408    const char *last_start = strend - trie->minlen;
2409 #ifdef DEBUGGING
2410    const char *real_start = s;
2411 #endif
2412    STRLEN maxlen = trie->maxlen;
2413    SV *sv_points;
2414    U8 **points; /* map of where we were in the input string
2415        when reading a given char. For ASCII this
2416        is unnecessary overhead as the relationship
2417        is always 1:1, but for Unicode, especially
2418        case folded Unicode this is not true. */
2419    U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2420    U8 *bitmap=NULL;
2421
2422
2423    GET_RE_DEBUG_FLAGS_DECL;
2424
2425    /* We can't just allocate points here. We need to wrap it in
2426    * an SV so it gets freed properly if there is a croak while
2427    * running the match */
2428    ENTER;
2429    SAVETMPS;
2430    sv_points=newSV(maxlen * sizeof(U8 *));
2431    SvCUR_set(sv_points,
2432     maxlen * sizeof(U8 *));
2433    SvPOK_on(sv_points);
2434    sv_2mortal(sv_points);
2435    points=(U8**)SvPV_nolen(sv_points );
2436    if ( trie_type != trie_utf8_fold
2437     && (trie->bitmap || OP(c)==AHOCORASICKC) )
2438    {
2439     if (trie->bitmap)
2440      bitmap=(U8*)trie->bitmap;
2441     else
2442      bitmap=(U8*)ANYOF_BITMAP(c);
2443    }
2444    /* this is the Aho-Corasick algorithm modified a touch
2445    to include special handling for long "unknown char" sequences.
2446    The basic idea being that we use AC as long as we are dealing
2447    with a possible matching char, when we encounter an unknown char
2448    (and we have not encountered an accepting state) we scan forward
2449    until we find a legal starting char.
2450    AC matching is basically that of trie matching, except that when
2451    we encounter a failing transition, we fall back to the current
2452    states "fail state", and try the current char again, a process
2453    we repeat until we reach the root state, state 1, or a legal
2454    transition. If we fail on the root state then we can either
2455    terminate if we have reached an accepting state previously, or
2456    restart the entire process from the beginning if we have not.
2457
2458    */
2459    while (s <= last_start) {
2460     const U32 uniflags = UTF8_ALLOW_DEFAULT;
2461     U8 *uc = (U8*)s;
2462     U16 charid = 0;
2463     U32 base = 1;
2464     U32 state = 1;
2465     UV uvc = 0;
2466     STRLEN len = 0;
2467     STRLEN foldlen = 0;
2468     U8 *uscan = (U8*)NULL;
2469     U8 *leftmost = NULL;
2470 #ifdef DEBUGGING
2471     U32 accepted_word= 0;
2472 #endif
2473     U32 pointpos = 0;
2474
2475     while ( state && uc <= (U8*)strend ) {
2476      int failed=0;
2477      U32 word = aho->states[ state ].wordnum;
2478
2479      if( state==1 ) {
2480       if ( bitmap ) {
2481        DEBUG_TRIE_EXECUTE_r(
2482         if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2483          dump_exec_pos( (char *)uc, c, strend, real_start,
2484           (char *)uc, utf8_target );
2485          PerlIO_printf( Perl_debug_log,
2486           " Scanning for legal start char...\n");
2487         }
2488        );
2489        if (utf8_target) {
2490         while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2491          uc += UTF8SKIP(uc);
2492         }
2493        } else {
2494         while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
2495          uc++;
2496         }
2497        }
2498        s= (char *)uc;
2499       }
2500       if (uc >(U8*)last_start) break;
2501      }
2502
2503      if ( word ) {
2504       U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2505       if (!leftmost || lpos < leftmost) {
2506        DEBUG_r(accepted_word=word);
2507        leftmost= lpos;
2508       }
2509       if (base==0) break;
2510
2511      }
2512      points[pointpos++ % maxlen]= uc;
2513      if (foldlen || uc < (U8*)strend) {
2514       REXEC_TRIE_READ_CHAR(trie_type, trie,
2515           widecharmap, uc,
2516           uscan, len, uvc, charid, foldlen,
2517           foldbuf, uniflags);
2518       DEBUG_TRIE_EXECUTE_r({
2519        dump_exec_pos( (char *)uc, c, strend,
2520           real_start, s, utf8_target);
2521        PerlIO_printf(Perl_debug_log,
2522         " Charid:%3u CP:%4"UVxf" ",
2523         charid, uvc);
2524       });
2525      }
2526      else {
2527       len = 0;
2528       charid = 0;
2529      }
2530
2531
2532      do {
2533 #ifdef DEBUGGING
2534       word = aho->states[ state ].wordnum;
2535 #endif
2536       base = aho->states[ state ].trans.base;
2537
2538       DEBUG_TRIE_EXECUTE_r({
2539        if (failed)
2540         dump_exec_pos( (char *)uc, c, strend, real_start,
2541          s,   utf8_target );
2542        PerlIO_printf( Perl_debug_log,
2543         "%sState: %4"UVxf", word=%"UVxf,
2544         failed ? " Fail transition to " : "",
2545         (UV)state, (UV)word);
2546       });
2547       if ( base ) {
2548        U32 tmp;
2549        I32 offset;
2550        if (charid &&
2551         ( ((offset = base + charid
2552          - 1 - trie->uniquecharcount)) >= 0)
2553         && ((U32)offset < trie->lasttrans)
2554         && trie->trans[offset].check == state
2555         && (tmp=trie->trans[offset].next))
2556        {
2557         DEBUG_TRIE_EXECUTE_r(
2558          PerlIO_printf( Perl_debug_log," - legal\n"));
2559         state = tmp;
2560         break;
2561        }
2562        else {
2563         DEBUG_TRIE_EXECUTE_r(
2564          PerlIO_printf( Perl_debug_log," - fail\n"));
2565         failed = 1;
2566         state = aho->fail[state];
2567        }
2568       }
2569       else {
2570        /* we must be accepting here */
2571        DEBUG_TRIE_EXECUTE_r(
2572          PerlIO_printf( Perl_debug_log," - accepting\n"));
2573        failed = 1;
2574        break;
2575       }
2576      } while(state);
2577      uc += len;
2578      if (failed) {
2579       if (leftmost)
2580        break;
2581       if (!state) state = 1;
2582      }
2583     }
2584     if ( aho->states[ state ].wordnum ) {
2585      U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2586      if (!leftmost || lpos < leftmost) {
2587       DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2588       leftmost = lpos;
2589      }
2590     }
2591     if (leftmost) {
2592      s = (char*)leftmost;
2593      DEBUG_TRIE_EXECUTE_r({
2594       PerlIO_printf(
2595        Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2596        (UV)accepted_word, (IV)(s - real_start)
2597       );
2598      });
2599      if (reginfo->intuit || regtry(reginfo, &s)) {
2600       FREETMPS;
2601       LEAVE;
2602       goto got_it;
2603      }
2604      s = HOPc(s,1);
2605      DEBUG_TRIE_EXECUTE_r({
2606       PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2607      });
2608     } else {
2609      DEBUG_TRIE_EXECUTE_r(
2610       PerlIO_printf( Perl_debug_log,"No match.\n"));
2611      break;
2612     }
2613    }
2614    FREETMPS;
2615    LEAVE;
2616   }
2617   break;
2618  default:
2619   Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2620  }
2621  return 0;
2622   got_it:
2623  return s;
2624 }
2625
2626 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2627  * flags have same meanings as with regexec_flags() */
2628
2629 static void
2630 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2631        char *strbeg,
2632        char *strend,
2633        SV *sv,
2634        U32 flags,
2635        bool utf8_target)
2636 {
2637  struct regexp *const prog = ReANY(rx);
2638
2639  if (flags & REXEC_COPY_STR) {
2640 #ifdef PERL_ANY_COW
2641   if (SvCANCOW(sv)) {
2642    if (DEBUG_C_TEST) {
2643     PerlIO_printf(Perl_debug_log,
2644        "Copy on write: regexp capture, type %d\n",
2645        (int) SvTYPE(sv));
2646    }
2647    /* Create a new COW SV to share the match string and store
2648    * in saved_copy, unless the current COW SV in saved_copy
2649    * is valid and suitable for our purpose */
2650    if ((   prog->saved_copy
2651     && SvIsCOW(prog->saved_copy)
2652     && SvPOKp(prog->saved_copy)
2653     && SvIsCOW(sv)
2654     && SvPOKp(sv)
2655     && SvPVX(sv) == SvPVX(prog->saved_copy)))
2656    {
2657     /* just reuse saved_copy SV */
2658     if (RXp_MATCH_COPIED(prog)) {
2659      Safefree(prog->subbeg);
2660      RXp_MATCH_COPIED_off(prog);
2661     }
2662    }
2663    else {
2664     /* create new COW SV to share string */
2665     RX_MATCH_COPY_FREE(rx);
2666     prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2667    }
2668    prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2669    assert (SvPOKp(prog->saved_copy));
2670    prog->sublen  = strend - strbeg;
2671    prog->suboffset = 0;
2672    prog->subcoffset = 0;
2673   } else
2674 #endif
2675   {
2676    SSize_t min = 0;
2677    SSize_t max = strend - strbeg;
2678    SSize_t sublen;
2679
2680    if (    (flags & REXEC_COPY_SKIP_POST)
2681     && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2682     && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2683    ) { /* don't copy $' part of string */
2684     U32 n = 0;
2685     max = -1;
2686     /* calculate the right-most part of the string covered
2687     * by a capture. Due to look-ahead, this may be to
2688     * the right of $&, so we have to scan all captures */
2689     while (n <= prog->lastparen) {
2690      if (prog->offs[n].end > max)
2691       max = prog->offs[n].end;
2692      n++;
2693     }
2694     if (max == -1)
2695      max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2696        ? prog->offs[0].start
2697        : 0;
2698     assert(max >= 0 && max <= strend - strbeg);
2699    }
2700
2701    if (    (flags & REXEC_COPY_SKIP_PRE)
2702     && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2703     && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2704    ) { /* don't copy $` part of string */
2705     U32 n = 0;
2706     min = max;
2707     /* calculate the left-most part of the string covered
2708     * by a capture. Due to look-behind, this may be to
2709     * the left of $&, so we have to scan all captures */
2710     while (min && n <= prog->lastparen) {
2711      if (   prog->offs[n].start != -1
2712       && prog->offs[n].start < min)
2713      {
2714       min = prog->offs[n].start;
2715      }
2716      n++;
2717     }
2718     if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2719      && min >  prog->offs[0].end
2720     )
2721      min = prog->offs[0].end;
2722
2723    }
2724
2725    assert(min >= 0 && min <= max && min <= strend - strbeg);
2726    sublen = max - min;
2727
2728    if (RX_MATCH_COPIED(rx)) {
2729     if (sublen > prog->sublen)
2730      prog->subbeg =
2731        (char*)saferealloc(prog->subbeg, sublen+1);
2732    }
2733    else
2734     prog->subbeg = (char*)safemalloc(sublen+1);
2735    Copy(strbeg + min, prog->subbeg, sublen, char);
2736    prog->subbeg[sublen] = '\0';
2737    prog->suboffset = min;
2738    prog->sublen = sublen;
2739    RX_MATCH_COPIED_on(rx);
2740   }
2741   prog->subcoffset = prog->suboffset;
2742   if (prog->suboffset && utf8_target) {
2743    /* Convert byte offset to chars.
2744    * XXX ideally should only compute this if @-/@+
2745    * has been seen, a la PL_sawampersand ??? */
2746
2747    /* If there's a direct correspondence between the
2748    * string which we're matching and the original SV,
2749    * then we can use the utf8 len cache associated with
2750    * the SV. In particular, it means that under //g,
2751    * sv_pos_b2u() will use the previously cached
2752    * position to speed up working out the new length of
2753    * subcoffset, rather than counting from the start of
2754    * the string each time. This stops
2755    *   $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2756    * from going quadratic */
2757    if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2758     prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2759             SV_GMAGIC|SV_CONST_RETURN);
2760    else
2761     prog->subcoffset = utf8_length((U8*)strbeg,
2762          (U8*)(strbeg+prog->suboffset));
2763   }
2764  }
2765  else {
2766   RX_MATCH_COPY_FREE(rx);
2767   prog->subbeg = strbeg;
2768   prog->suboffset = 0;
2769   prog->subcoffset = 0;
2770   prog->sublen = strend - strbeg;
2771  }
2772 }
2773
2774
2775
2776
2777 /*
2778  - regexec_flags - match a regexp against a string
2779  */
2780 I32
2781 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2782    char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2783 /* stringarg: the point in the string at which to begin matching */
2784 /* strend:    pointer to null at end of string */
2785 /* strbeg:    real beginning of string */
2786 /* minend:    end of match must be >= minend bytes after stringarg. */
2787 /* sv:        SV being matched: only used for utf8 flag, pos() etc; string
2788  *            itself is accessed via the pointers above */
2789 /* data:      May be used for some additional optimizations.
2790    Currently unused. */
2791 /* flags:     For optimizations. See REXEC_* in regexp.h */
2792
2793 {
2794  struct regexp *const prog = ReANY(rx);
2795  char *s;
2796  regnode *c;
2797  char *startpos;
2798  SSize_t minlen;  /* must match at least this many chars */
2799  SSize_t dontbother = 0; /* how many characters not to try at end */
2800  const bool utf8_target = cBOOL(DO_UTF8(sv));
2801  I32 multiline;
2802  RXi_GET_DECL(prog,progi);
2803  regmatch_info reginfo_buf;  /* create some info to pass to regtry etc */
2804  regmatch_info *const reginfo = &reginfo_buf;
2805  regexp_paren_pair *swap = NULL;
2806  I32 oldsave;
2807  GET_RE_DEBUG_FLAGS_DECL;
2808
2809  PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2810  PERL_UNUSED_ARG(data);
2811
2812  /* Be paranoid... */
2813  if (prog == NULL) {
2814   Perl_croak(aTHX_ "NULL regexp parameter");
2815  }
2816
2817  DEBUG_EXECUTE_r(
2818   debug_start_match(rx, utf8_target, stringarg, strend,
2819   "Matching");
2820  );
2821
2822  startpos = stringarg;
2823
2824  if (prog->intflags & PREGf_GPOS_SEEN) {
2825   MAGIC *mg;
2826
2827   /* set reginfo->ganch, the position where \G can match */
2828
2829   reginfo->ganch =
2830    (flags & REXEC_IGNOREPOS)
2831    ? stringarg /* use start pos rather than pos() */
2832    : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2833    /* Defined pos(): */
2834    ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2835    : strbeg; /* pos() not defined; use start of string */
2836
2837   DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2838    "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2839
2840   /* in the presence of \G, we may need to start looking earlier in
2841   * the string than the suggested start point of stringarg:
2842   * if prog->gofs is set, then that's a known, fixed minimum
2843   * offset, such as
2844   * /..\G/:   gofs = 2
2845   * /ab|c\G/: gofs = 1
2846   * or if the minimum offset isn't known, then we have to go back
2847   * to the start of the string, e.g. /w+\G/
2848   */
2849
2850   if (prog->intflags & PREGf_ANCH_GPOS) {
2851    startpos  = reginfo->ganch - prog->gofs;
2852    if (startpos <
2853     ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2854    {
2855     DEBUG_r(PerlIO_printf(Perl_debug_log,
2856       "fail: ganch-gofs before earliest possible start\n"));
2857     return 0;
2858    }
2859   }
2860   else if (prog->gofs) {
2861    if (startpos - prog->gofs < strbeg)
2862     startpos = strbeg;
2863    else
2864     startpos -= prog->gofs;
2865   }
2866   else if (prog->intflags & PREGf_GPOS_FLOAT)
2867    startpos = strbeg;
2868  }
2869
2870  minlen = prog->minlen;
2871  if ((startpos + minlen) > strend || startpos < strbeg) {
2872   DEBUG_r(PerlIO_printf(Perl_debug_log,
2873      "Regex match can't succeed, so not even tried\n"));
2874   return 0;
2875  }
2876
2877  /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2878  * which will call destuctors to reset PL_regmatch_state, free higher
2879  * PL_regmatch_slabs, and clean up regmatch_info_aux and
2880  * regmatch_info_aux_eval */
2881
2882  oldsave = PL_savestack_ix;
2883
2884  s = startpos;
2885
2886  if ((prog->extflags & RXf_USE_INTUIT)
2887   && !(flags & REXEC_CHECKED))
2888  {
2889   s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2890          flags, NULL);
2891   if (!s)
2892    return 0;
2893
2894   if (prog->extflags & RXf_CHECK_ALL) {
2895    /* we can match based purely on the result of INTUIT.
2896    * Set up captures etc just for $& and $-[0]
2897    * (an intuit-only match wont have $1,$2,..) */
2898    assert(!prog->nparens);
2899
2900    /* s/// doesn't like it if $& is earlier than where we asked it to
2901    * start searching (which can happen on something like /.\G/) */
2902    if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
2903      && (s < stringarg))
2904    {
2905     /* this should only be possible under \G */
2906     assert(prog->intflags & PREGf_GPOS_SEEN);
2907     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2908      "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2909     goto phooey;
2910    }
2911
2912    /* match via INTUIT shouldn't have any captures.
2913    * Let @-, @+, $^N know */
2914    prog->lastparen = prog->lastcloseparen = 0;
2915    RX_MATCH_UTF8_set(rx, utf8_target);
2916    prog->offs[0].start = s - strbeg;
2917    prog->offs[0].end = utf8_target
2918     ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2919     : s - strbeg + prog->minlenret;
2920    if ( !(flags & REXEC_NOT_FIRST) )
2921     S_reg_set_capture_string(aTHX_ rx,
2922           strbeg, strend,
2923           sv, flags, utf8_target);
2924
2925    return 1;
2926   }
2927  }
2928
2929  multiline = prog->extflags & RXf_PMf_MULTILINE;
2930
2931  if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2932   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2933        "String too short [regexec_flags]...\n"));
2934   goto phooey;
2935  }
2936
2937  /* Check validity of program. */
2938  if (UCHARAT(progi->program) != REG_MAGIC) {
2939   Perl_croak(aTHX_ "corrupted regexp program");
2940  }
2941
2942  RX_MATCH_TAINTED_off(rx);
2943  RX_MATCH_UTF8_set(rx, utf8_target);
2944
2945  reginfo->prog = rx;  /* Yes, sorry that this is confusing.  */
2946  reginfo->intuit = 0;
2947  reginfo->is_utf8_target = cBOOL(utf8_target);
2948  reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2949  reginfo->warned = FALSE;
2950  reginfo->strbeg  = strbeg;
2951  reginfo->sv = sv;
2952  reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2953  reginfo->strend = strend;
2954  /* see how far we have to get to not match where we matched before */
2955  reginfo->till = stringarg + minend;
2956
2957  if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2958   /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2959   S_cleanup_regmatch_info_aux has executed (registered by
2960   SAVEDESTRUCTOR_X below).  S_cleanup_regmatch_info_aux modifies
2961   magic belonging to this SV.
2962   Not newSVsv, either, as it does not COW.
2963   */
2964   reginfo->sv = newSV(0);
2965   SvSetSV_nosteal(reginfo->sv, sv);
2966   SAVEFREESV(reginfo->sv);
2967  }
2968
2969  /* reserve next 2 or 3 slots in PL_regmatch_state:
2970  * slot N+0: may currently be in use: skip it
2971  * slot N+1: use for regmatch_info_aux struct
2972  * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2973  * slot N+3: ready for use by regmatch()
2974  */
2975
2976  {
2977   regmatch_state *old_regmatch_state;
2978   regmatch_slab  *old_regmatch_slab;
2979   int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2980
2981   /* on first ever match, allocate first slab */
2982   if (!PL_regmatch_slab) {
2983    Newx(PL_regmatch_slab, 1, regmatch_slab);
2984    PL_regmatch_slab->prev = NULL;
2985    PL_regmatch_slab->next = NULL;
2986    PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2987   }
2988
2989   old_regmatch_state = PL_regmatch_state;
2990   old_regmatch_slab  = PL_regmatch_slab;
2991
2992   for (i=0; i <= max; i++) {
2993    if (i == 1)
2994     reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2995    else if (i ==2)
2996     reginfo->info_aux_eval =
2997     reginfo->info_aux->info_aux_eval =
2998        &(PL_regmatch_state->u.info_aux_eval);
2999
3000    if (++PL_regmatch_state >  SLAB_LAST(PL_regmatch_slab))
3001     PL_regmatch_state = S_push_slab(aTHX);
3002   }
3003
3004   /* note initial PL_regmatch_state position; at end of match we'll
3005   * pop back to there and free any higher slabs */
3006
3007   reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3008   reginfo->info_aux->old_regmatch_slab  = old_regmatch_slab;
3009   reginfo->info_aux->poscache = NULL;
3010
3011   SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3012
3013   if ((prog->extflags & RXf_EVAL_SEEN))
3014    S_setup_eval_state(aTHX_ reginfo);
3015   else
3016    reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3017  }
3018
3019  /* If there is a "must appear" string, look for it. */
3020
3021  if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3022   /* We have to be careful. If the previous successful match
3023   was from this regex we don't want a subsequent partially
3024   successful match to clobber the old results.
3025   So when we detect this possibility we add a swap buffer
3026   to the re, and switch the buffer each match. If we fail,
3027   we switch it back; otherwise we leave it swapped.
3028   */
3029   swap = prog->offs;
3030   /* do we need a save destructor here for eval dies? */
3031   Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3032   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3033    "rex=0x%"UVxf" saving  offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3034    PTR2UV(prog),
3035    PTR2UV(swap),
3036    PTR2UV(prog->offs)
3037   ));
3038  }
3039
3040  /* Simplest case: anchored match need be tried only once, or with
3041  * MBOL, only at the beginning of each line.
3042  *
3043  * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3044  * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3045  * match at the start of the string then it won't match anywhere else
3046  * either; while with /.*.../, if it doesn't match at the beginning,
3047  * the earliest it could match is at the start of the next line */
3048
3049  if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3050   char *end;
3051
3052   if (regtry(reginfo, &s))
3053    goto got_it;
3054
3055   if (!(prog->intflags & PREGf_ANCH_MBOL))
3056    goto phooey;
3057
3058   /* didn't match at start, try at other newline positions */
3059
3060   if (minlen)
3061    dontbother = minlen - 1;
3062   end = HOP3c(strend, -dontbother, strbeg) - 1;
3063
3064   /* skip to next newline */
3065
3066   while (s <= end) { /* note it could be possible to match at the end of the string */
3067    /* NB: newlines are the same in unicode as they are in latin */
3068    if (*s++ != '\n')
3069     continue;
3070    if (prog->check_substr || prog->check_utf8) {
3071    /* note that with PREGf_IMPLICIT, intuit can only fail
3072    * or return the start position, so it's of limited utility.
3073    * Nevertheless, I made the decision that the potential for
3074    * quick fail was still worth it - DAPM */
3075     s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3076     if (!s)
3077      goto phooey;
3078    }
3079    if (regtry(reginfo, &s))
3080     goto got_it;
3081   }
3082   goto phooey;
3083  } /* end anchored search */
3084
3085  if (prog->intflags & PREGf_ANCH_GPOS)
3086  {
3087   /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3088   assert(prog->intflags & PREGf_GPOS_SEEN);
3089   /* For anchored \G, the only position it can match from is
3090   * (ganch-gofs); we already set startpos to this above; if intuit
3091   * moved us on from there, we can't possibly succeed */
3092   assert(startpos == reginfo->ganch - prog->gofs);
3093   if (s == startpos && regtry(reginfo, &s))
3094    goto got_it;
3095   goto phooey;
3096  }
3097
3098  /* Messy cases:  unanchored match. */
3099  if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3100   /* we have /x+whatever/ */
3101   /* it must be a one character string (XXXX Except is_utf8_pat?) */
3102   char ch;
3103 #ifdef DEBUGGING
3104   int did_match = 0;
3105 #endif
3106   if (utf8_target) {
3107    if (! prog->anchored_utf8) {
3108     to_utf8_substr(prog);
3109    }
3110    ch = SvPVX_const(prog->anchored_utf8)[0];
3111    REXEC_FBC_SCAN(
3112     if (*s == ch) {
3113      DEBUG_EXECUTE_r( did_match = 1 );
3114      if (regtry(reginfo, &s)) goto got_it;
3115      s += UTF8SKIP(s);
3116      while (s < strend && *s == ch)
3117       s += UTF8SKIP(s);
3118     }
3119    );
3120
3121   }
3122   else {
3123    if (! prog->anchored_substr) {
3124     if (! to_byte_substr(prog)) {
3125      NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3126     }
3127    }
3128    ch = SvPVX_const(prog->anchored_substr)[0];
3129    REXEC_FBC_SCAN(
3130     if (*s == ch) {
3131      DEBUG_EXECUTE_r( did_match = 1 );
3132      if (regtry(reginfo, &s)) goto got_it;
3133      s++;
3134      while (s < strend && *s == ch)
3135       s++;
3136     }
3137    );
3138   }
3139   DEBUG_EXECUTE_r(if (!did_match)
3140     PerlIO_printf(Perl_debug_log,
3141         "Did not find anchored character...\n")
3142    );
3143  }
3144  else if (prog->anchored_substr != NULL
3145    || prog->anchored_utf8 != NULL
3146    || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3147     && prog->float_max_offset < strend - s)) {
3148   SV *must;
3149   SSize_t back_max;
3150   SSize_t back_min;
3151   char *last;
3152   char *last1;  /* Last position checked before */
3153 #ifdef DEBUGGING
3154   int did_match = 0;
3155 #endif
3156   if (prog->anchored_substr || prog->anchored_utf8) {
3157    if (utf8_target) {
3158     if (! prog->anchored_utf8) {
3159      to_utf8_substr(prog);
3160     }
3161     must = prog->anchored_utf8;
3162    }
3163    else {
3164     if (! prog->anchored_substr) {
3165      if (! to_byte_substr(prog)) {
3166       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3167      }
3168     }
3169     must = prog->anchored_substr;
3170    }
3171    back_max = back_min = prog->anchored_offset;
3172   } else {
3173    if (utf8_target) {
3174     if (! prog->float_utf8) {
3175      to_utf8_substr(prog);
3176     }
3177     must = prog->float_utf8;
3178    }
3179    else {
3180     if (! prog->float_substr) {
3181      if (! to_byte_substr(prog)) {
3182       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3183      }
3184     }
3185     must = prog->float_substr;
3186    }
3187    back_max = prog->float_max_offset;
3188    back_min = prog->float_min_offset;
3189   }
3190
3191   if (back_min<0) {
3192    last = strend;
3193   } else {
3194    last = HOP3c(strend, /* Cannot start after this */
3195     -(SSize_t)(CHR_SVLEN(must)
3196       - (SvTAIL(must) != 0) + back_min), strbeg);
3197   }
3198   if (s > reginfo->strbeg)
3199    last1 = HOPc(s, -1);
3200   else
3201    last1 = s - 1; /* bogus */
3202
3203   /* XXXX check_substr already used to find "s", can optimize if
3204   check_substr==must. */
3205   dontbother = 0;
3206   strend = HOPc(strend, -dontbother);
3207   while ( (s <= last) &&
3208     (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg,  strend),
3209         (unsigned char*)strend, must,
3210         multiline ? FBMrf_MULTILINE : 0)) ) {
3211    DEBUG_EXECUTE_r( did_match = 1 );
3212    if (HOPc(s, -back_max) > last1) {
3213     last1 = HOPc(s, -back_min);
3214     s = HOPc(s, -back_max);
3215    }
3216    else {
3217     char * const t = (last1 >= reginfo->strbeg)
3218          ? HOPc(last1, 1) : last1 + 1;
3219
3220     last1 = HOPc(s, -back_min);
3221     s = t;
3222    }
3223    if (utf8_target) {
3224     while (s <= last1) {
3225      if (regtry(reginfo, &s))
3226       goto got_it;
3227      if (s >= last1) {
3228       s++; /* to break out of outer loop */
3229       break;
3230      }
3231      s += UTF8SKIP(s);
3232     }
3233    }
3234    else {
3235     while (s <= last1) {
3236      if (regtry(reginfo, &s))
3237       goto got_it;
3238      s++;
3239     }
3240    }
3241   }
3242   DEBUG_EXECUTE_r(if (!did_match) {
3243    RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3244     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3245    PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
3246        ((must == prog->anchored_substr || must == prog->anchored_utf8)
3247        ? "anchored" : "floating"),
3248     quoted, RE_SV_TAIL(must));
3249   });
3250   goto phooey;
3251  }
3252  else if ( (c = progi->regstclass) ) {
3253   if (minlen) {
3254    const OPCODE op = OP(progi->regstclass);
3255    /* don't bother with what can't match */
3256    if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3257     strend = HOPc(strend, -(minlen - 1));
3258   }
3259   DEBUG_EXECUTE_r({
3260    SV * const prop = sv_newmortal();
3261    regprop(prog, prop, c, reginfo, NULL);
3262    {
3263     RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3264      s,strend-s,60);
3265     PerlIO_printf(Perl_debug_log,
3266      "Matching stclass %.*s against %s (%d bytes)\n",
3267      (int)SvCUR(prop), SvPVX_const(prop),
3268      quoted, (int)(strend - s));
3269    }
3270   });
3271   if (find_byclass(prog, c, s, strend, reginfo))
3272    goto got_it;
3273   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
3274  }
3275  else {
3276   dontbother = 0;
3277   if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3278    /* Trim the end. */
3279    char *last= NULL;
3280    SV* float_real;
3281    STRLEN len;
3282    const char *little;
3283
3284    if (utf8_target) {
3285     if (! prog->float_utf8) {
3286      to_utf8_substr(prog);
3287     }
3288     float_real = prog->float_utf8;
3289    }
3290    else {
3291     if (! prog->float_substr) {
3292      if (! to_byte_substr(prog)) {
3293       NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3294      }
3295     }
3296     float_real = prog->float_substr;
3297    }
3298
3299    little = SvPV_const(float_real, len);
3300    if (SvTAIL(float_real)) {
3301      /* This means that float_real contains an artificial \n on
3302      * the end due to the presence of something like this:
3303      * /foo$/ where we can match both "foo" and "foo\n" at the
3304      * end of the string.  So we have to compare the end of the
3305      * string first against the float_real without the \n and
3306      * then against the full float_real with the string.  We
3307      * have to watch out for cases where the string might be
3308      * smaller than the float_real or the float_real without
3309      * the \n. */
3310      char *checkpos= strend - len;
3311      DEBUG_OPTIMISE_r(
3312       PerlIO_printf(Perl_debug_log,
3313        "%sChecking for float_real.%s\n",
3314        PL_colors[4], PL_colors[5]));
3315      if (checkpos + 1 < strbeg) {
3316       /* can't match, even if we remove the trailing \n
3317       * string is too short to match */
3318       DEBUG_EXECUTE_r(
3319        PerlIO_printf(Perl_debug_log,
3320         "%sString shorter than required trailing substring, cannot match.%s\n",
3321         PL_colors[4], PL_colors[5]));
3322       goto phooey;
3323      } else if (memEQ(checkpos + 1, little, len - 1)) {
3324       /* can match, the end of the string matches without the
3325       * "\n" */
3326       last = checkpos + 1;
3327      } else if (checkpos < strbeg) {
3328       /* cant match, string is too short when the "\n" is
3329       * included */
3330       DEBUG_EXECUTE_r(
3331        PerlIO_printf(Perl_debug_log,
3332         "%sString does not contain required trailing substring, cannot match.%s\n",
3333         PL_colors[4], PL_colors[5]));
3334       goto phooey;
3335      } else if (!multiline) {
3336       /* non multiline match, so compare with the "\n" at the
3337       * end of the string */
3338       if (memEQ(checkpos, little, len)) {
3339        last= checkpos;
3340       } else {
3341        DEBUG_EXECUTE_r(
3342         PerlIO_printf(Perl_debug_log,
3343          "%sString does not contain required trailing substring, cannot match.%s\n",
3344          PL_colors[4], PL_colors[5]));
3345        goto phooey;
3346       }
3347      } else {
3348       /* multiline match, so we have to search for a place
3349       * where the full string is located */
3350       goto find_last;
3351      }
3352    } else {
3353     find_last:
3354      if (len)
3355       last = rninstr(s, strend, little, little + len);
3356      else
3357       last = strend; /* matching "$" */
3358    }
3359    if (!last) {
3360     /* at one point this block contained a comment which was
3361     * probably incorrect, which said that this was a "should not
3362     * happen" case.  Even if it was true when it was written I am
3363     * pretty sure it is not anymore, so I have removed the comment
3364     * and replaced it with this one. Yves */
3365     DEBUG_EXECUTE_r(
3366      PerlIO_printf(Perl_debug_log,
3367       "%sString does not contain required substring, cannot match.%s\n",
3368       PL_colors[4], PL_colors[5]
3369      ));
3370     goto phooey;
3371    }
3372    dontbother = strend - last + prog->float_min_offset;
3373   }
3374   if (minlen && (dontbother < minlen))
3375    dontbother = minlen - 1;
3376   strend -= dontbother;      /* this one's always in bytes! */
3377   /* We don't know much -- general case. */
3378   if (utf8_target) {
3379    for (;;) {
3380     if (regtry(reginfo, &s))
3381      goto got_it;
3382     if (s >= strend)
3383      break;
3384     s += UTF8SKIP(s);
3385    };
3386   }
3387   else {
3388    do {
3389     if (regtry(reginfo, &s))
3390      goto got_it;
3391    } while (s++ < strend);
3392   }
3393  }
3394
3395  /* Failure. */
3396  goto phooey;
3397
3398   got_it:
3399  /* s/// doesn't like it if $& is earlier than where we asked it to
3400  * start searching (which can happen on something like /.\G/) */
3401  if (       (flags & REXEC_FAIL_ON_UNDERFLOW)
3402    && (prog->offs[0].start < stringarg - strbeg))
3403  {
3404   /* this should only be possible under \G */
3405   assert(prog->intflags & PREGf_GPOS_SEEN);
3406   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3407    "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3408   goto phooey;
3409  }
3410
3411  DEBUG_BUFFERS_r(
3412   if (swap)
3413    PerlIO_printf(Perl_debug_log,
3414     "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3415     PTR2UV(prog),
3416     PTR2UV(swap)
3417    );
3418  );
3419  Safefree(swap);
3420
3421  /* clean up; this will trigger destructors that will free all slabs
3422  * above the current one, and cleanup the regmatch_info_aux
3423  * and regmatch_info_aux_eval sructs */
3424
3425  LEAVE_SCOPE(oldsave);
3426
3427  if (RXp_PAREN_NAMES(prog))
3428   (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3429
3430  /* make sure $`, $&, $', and $digit will work later */
3431  if ( !(flags & REXEC_NOT_FIRST) )
3432   S_reg_set_capture_string(aTHX_ rx,
3433          strbeg, reginfo->strend,
3434          sv, flags, utf8_target);
3435
3436  return 1;
3437
3438   phooey:
3439  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3440       PL_colors[4], PL_colors[5]));
3441
3442  /* clean up; this will trigger destructors that will free all slabs
3443  * above the current one, and cleanup the regmatch_info_aux
3444  * and regmatch_info_aux_eval sructs */
3445
3446  LEAVE_SCOPE(oldsave);
3447
3448  if (swap) {
3449   /* we failed :-( roll it back */
3450   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3451    "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3452    PTR2UV(prog),
3453    PTR2UV(prog->offs),
3454    PTR2UV(swap)
3455   ));
3456   Safefree(prog->offs);
3457   prog->offs = swap;
3458  }
3459  return 0;
3460 }
3461
3462
3463 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3464  * Do inc before dec, in case old and new rex are the same */
3465 #define SET_reg_curpm(Re2)                          \
3466  if (reginfo->info_aux_eval) {                   \
3467   (void)ReREFCNT_inc(Re2);      \
3468   ReREFCNT_dec(PM_GETRE(PL_reg_curpm));     \
3469   PM_SETRE((PL_reg_curpm), (Re2));     \
3470  }
3471
3472
3473 /*
3474  - regtry - try match at specific point
3475  */
3476 STATIC I32   /* 0 failure, 1 success */
3477 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3478 {
3479  CHECKPOINT lastcp;
3480  REGEXP *const rx = reginfo->prog;
3481  regexp *const prog = ReANY(rx);
3482  SSize_t result;
3483  RXi_GET_DECL(prog,progi);
3484  GET_RE_DEBUG_FLAGS_DECL;
3485
3486  PERL_ARGS_ASSERT_REGTRY;
3487
3488  reginfo->cutpoint=NULL;
3489
3490  prog->offs[0].start = *startposp - reginfo->strbeg;
3491  prog->lastparen = 0;
3492  prog->lastcloseparen = 0;
3493
3494  /* XXXX What this code is doing here?!!!  There should be no need
3495  to do this again and again, prog->lastparen should take care of
3496  this!  --ilya*/
3497
3498  /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3499  * Actually, the code in regcppop() (which Ilya may be meaning by
3500  * prog->lastparen), is not needed at all by the test suite
3501  * (op/regexp, op/pat, op/split), but that code is needed otherwise
3502  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3503  * Meanwhile, this code *is* needed for the
3504  * above-mentioned test suite tests to succeed.  The common theme
3505  * on those tests seems to be returning null fields from matches.
3506  * --jhi updated by dapm */
3507 #if 1
3508  if (prog->nparens) {
3509   regexp_paren_pair *pp = prog->offs;
3510   I32 i;
3511   for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3512    ++pp;
3513    pp->start = -1;
3514    pp->end = -1;
3515   }
3516  }
3517 #endif
3518  REGCP_SET(lastcp);
3519  result = regmatch(reginfo, *startposp, progi->program + 1);
3520  if (result != -1) {
3521   prog->offs[0].end = result;
3522   return 1;
3523  }
3524  if (reginfo->cutpoint)
3525   *startposp= reginfo->cutpoint;
3526  REGCP_UNWIND(lastcp);
3527  return 0;
3528 }
3529
3530
3531 #define sayYES goto yes
3532 #define sayNO goto no
3533 #define sayNO_SILENT goto no_silent
3534
3535 /* we dont use STMT_START/END here because it leads to
3536    "unreachable code" warnings, which are bogus, but distracting. */
3537 #define CACHEsayNO \
3538  if (ST.cache_mask) \
3539  reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3540  sayNO
3541
3542 /* this is used to determine how far from the left messages like
3543    'failed...' are printed. It should be set such that messages
3544    are inline with the regop output that created them.
3545 */
3546 #define REPORT_CODE_OFF 32
3547
3548
3549 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3550 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
3551 #define CHRTEST_NOT_A_CP_1 -999
3552 #define CHRTEST_NOT_A_CP_2 -998
3553
3554 /* grab a new slab and return the first slot in it */
3555
3556 STATIC regmatch_state *
3557 S_push_slab(pTHX)
3558 {
3559 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3560  dMY_CXT;
3561 #endif
3562  regmatch_slab *s = PL_regmatch_slab->next;
3563  if (!s) {
3564   Newx(s, 1, regmatch_slab);
3565   s->prev = PL_regmatch_slab;
3566   s->next = NULL;
3567   PL_regmatch_slab->next = s;
3568  }
3569  PL_regmatch_slab = s;
3570  return SLAB_FIRST(s);
3571 }
3572
3573
3574 /* push a new state then goto it */
3575
3576 #define PUSH_STATE_GOTO(state, node, input) \
3577  pushinput = input; \
3578  scan = node; \
3579  st->resume_state = state; \
3580  goto push_state;
3581
3582 /* push a new state with success backtracking, then goto it */
3583
3584 #define PUSH_YES_STATE_GOTO(state, node, input) \
3585  pushinput = input; \
3586  scan = node; \
3587  st->resume_state = state; \
3588  goto push_yes_state;
3589
3590
3591
3592
3593 /*
3594
3595 regmatch() - main matching routine
3596
3597 This is basically one big switch statement in a loop. We execute an op,
3598 set 'next' to point the next op, and continue. If we come to a point which
3599 we may need to backtrack to on failure such as (A|B|C), we push a
3600 backtrack state onto the backtrack stack. On failure, we pop the top
3601 state, and re-enter the loop at the state indicated. If there are no more
3602 states to pop, we return failure.
3603
3604 Sometimes we also need to backtrack on success; for example /A+/, where
3605 after successfully matching one A, we need to go back and try to
3606 match another one; similarly for lookahead assertions: if the assertion
3607 completes successfully, we backtrack to the state just before the assertion
3608 and then carry on.  In these cases, the pushed state is marked as
3609 'backtrack on success too'. This marking is in fact done by a chain of
3610 pointers, each pointing to the previous 'yes' state. On success, we pop to
3611 the nearest yes state, discarding any intermediate failure-only states.
3612 Sometimes a yes state is pushed just to force some cleanup code to be
3613 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3614 it to free the inner regex.
3615
3616 Note that failure backtracking rewinds the cursor position, while
3617 success backtracking leaves it alone.
3618
3619 A pattern is complete when the END op is executed, while a subpattern
3620 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3621 ops trigger the "pop to last yes state if any, otherwise return true"
3622 behaviour.
3623
3624 A common convention in this function is to use A and B to refer to the two
3625 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3626 the subpattern to be matched possibly multiple times, while B is the entire
3627 rest of the pattern. Variable and state names reflect this convention.
3628
3629 The states in the main switch are the union of ops and failure/success of
3630 substates associated with with that op.  For example, IFMATCH is the op
3631 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3632 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3633 successfully matched A and IFMATCH_A_fail is a state saying that we have
3634 just failed to match A. Resume states always come in pairs. The backtrack
3635 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3636 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3637 on success or failure.
3638
3639 The struct that holds a backtracking state is actually a big union, with
3640 one variant for each major type of op. The variable st points to the
3641 top-most backtrack struct. To make the code clearer, within each
3642 block of code we #define ST to alias the relevant union.
3643
3644 Here's a concrete example of a (vastly oversimplified) IFMATCH
3645 implementation:
3646
3647  switch (state) {
3648  ....
3649
3650 #define ST st->u.ifmatch
3651
3652  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3653   ST.foo = ...; // some state we wish to save
3654   ...
3655   // push a yes backtrack state with a resume value of
3656   // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3657   // first node of A:
3658   PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3659   // NOTREACHED
3660
3661  case IFMATCH_A: // we have successfully executed A; now continue with B
3662   next = B;
3663   bar = ST.foo; // do something with the preserved value
3664   break;
3665
3666  case IFMATCH_A_fail: // A failed, so the assertion failed
3667   ...;   // do some housekeeping, then ...
3668   sayNO; // propagate the failure
3669
3670 #undef ST
3671
3672  ...
3673  }
3674
3675 For any old-timers reading this who are familiar with the old recursive
3676 approach, the code above is equivalent to:
3677
3678  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3679  {
3680   int foo = ...
3681   ...
3682   if (regmatch(A)) {
3683    next = B;
3684    bar = foo;
3685    break;
3686   }
3687   ...;   // do some housekeeping, then ...
3688   sayNO; // propagate the failure
3689  }
3690
3691 The topmost backtrack state, pointed to by st, is usually free. If you
3692 want to claim it, populate any ST.foo fields in it with values you wish to
3693 save, then do one of
3694
3695   PUSH_STATE_GOTO(resume_state, node, newinput);
3696   PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3697
3698 which sets that backtrack state's resume value to 'resume_state', pushes a
3699 new free entry to the top of the backtrack stack, then goes to 'node'.
3700 On backtracking, the free slot is popped, and the saved state becomes the
3701 new free state. An ST.foo field in this new top state can be temporarily
3702 accessed to retrieve values, but once the main loop is re-entered, it
3703 becomes available for reuse.
3704
3705 Note that the depth of the backtrack stack constantly increases during the
3706 left-to-right execution of the pattern, rather than going up and down with
3707 the pattern nesting. For example the stack is at its maximum at Z at the
3708 end of the pattern, rather than at X in the following:
3709
3710  /(((X)+)+)+....(Y)+....Z/
3711
3712 The only exceptions to this are lookahead/behind assertions and the cut,
3713 (?>A), which pop all the backtrack states associated with A before
3714 continuing.
3715
3716 Backtrack state structs are allocated in slabs of about 4K in size.
3717 PL_regmatch_state and st always point to the currently active state,
3718 and PL_regmatch_slab points to the slab currently containing
3719 PL_regmatch_state.  The first time regmatch() is called, the first slab is
3720 allocated, and is never freed until interpreter destruction. When the slab
3721 is full, a new one is allocated and chained to the end. At exit from
3722 regmatch(), slabs allocated since entry are freed.
3723
3724 */
3725
3726
3727 #define DEBUG_STATE_pp(pp)        \
3728  DEBUG_STATE_r({         \
3729   DUMP_EXEC_POS(locinput, scan, utf8_target);         \
3730   PerlIO_printf(Perl_debug_log,       \
3731    "    %*s"pp" %s%s%s%s%s\n",       \
3732    depth*2, "",        \
3733    PL_reg_name[st->resume_state],                  \
3734    ((st==yes_state||st==mark_state) ? "[" : ""),   \
3735    ((st==yes_state) ? "Y" : ""),                   \
3736    ((st==mark_state) ? "M" : ""),                  \
3737    ((st==yes_state||st==mark_state) ? "]" : "")    \
3738   );                                                  \
3739  });
3740
3741
3742 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3743
3744 #ifdef DEBUGGING
3745
3746 STATIC void
3747 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3748  const char *start, const char *end, const char *blurb)
3749 {
3750  const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3751
3752  PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3753
3754  if (!PL_colorset)
3755    reginitcolors();
3756  {
3757   RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3758    RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3759
3760   RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3761    start, end - start, 60);
3762
3763   PerlIO_printf(Perl_debug_log,
3764    "%s%s REx%s %s against %s\n",
3765      PL_colors[4], blurb, PL_colors[5], s0, s1);
3766
3767   if (utf8_target||utf8_pat)
3768    PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3769     utf8_pat ? "pattern" : "",
3770     utf8_pat && utf8_target ? " and " : "",
3771     utf8_target ? "string" : ""
3772    );
3773  }
3774 }
3775
3776 STATIC void
3777 S_dump_exec_pos(pTHX_ const char *locinput,
3778      const regnode *scan,
3779      const char *loc_regeol,
3780      const char *loc_bostr,
3781      const char *loc_reg_starttry,
3782      const bool utf8_target)
3783 {
3784  const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3785  const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3786  int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3787  /* The part of the string before starttry has one color
3788  (pref0_len chars), between starttry and current
3789  position another one (pref_len - pref0_len chars),
3790  after the current position the third one.
3791  We assume that pref0_len <= pref_len, otherwise we
3792  decrease pref0_len.  */
3793  int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3794   ? (5 + taill) - l : locinput - loc_bostr;
3795  int pref0_len;
3796
3797  PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3798
3799  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3800   pref_len++;
3801  pref0_len = pref_len  - (locinput - loc_reg_starttry);
3802  if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3803   l = ( loc_regeol - locinput > (5 + taill) - pref_len
3804    ? (5 + taill) - pref_len : loc_regeol - locinput);
3805  while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3806   l--;
3807  if (pref0_len < 0)
3808   pref0_len = 0;
3809  if (pref0_len > pref_len)
3810   pref0_len = pref_len;
3811  {
3812   const int is_uni = utf8_target ? 1 : 0;
3813
3814   RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3815    (locinput - pref_len),pref0_len, 60, 4, 5);
3816
3817   RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3818      (locinput - pref_len + pref0_len),
3819      pref_len - pref0_len, 60, 2, 3);
3820
3821   RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3822      locinput, loc_regeol - locinput, 10, 0, 1);
3823
3824   const STRLEN tlen=len0+len1+len2;
3825   PerlIO_printf(Perl_debug_log,
3826      "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3827      (IV)(locinput - loc_bostr),
3828      len0, s0,
3829      len1, s1,
3830      (docolor ? "" : "> <"),
3831      len2, s2,
3832      (int)(tlen > 19 ? 0 :  19 - tlen),
3833      "");
3834  }
3835 }
3836
3837 #endif
3838
3839 /* reg_check_named_buff_matched()
3840  * Checks to see if a named buffer has matched. The data array of
3841  * buffer numbers corresponding to the buffer is expected to reside
3842  * in the regexp->data->data array in the slot stored in the ARG() of
3843  * node involved. Note that this routine doesn't actually care about the
3844  * name, that information is not preserved from compilation to execution.
3845  * Returns the index of the leftmost defined buffer with the given name
3846  * or 0 if non of the buffers matched.
3847  */
3848 STATIC I32
3849 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3850 {
3851  I32 n;
3852  RXi_GET_DECL(rex,rexi);
3853  SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3854  I32 *nums=(I32*)SvPVX(sv_dat);
3855
3856  PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3857
3858  for ( n=0; n<SvIVX(sv_dat); n++ ) {
3859   if ((I32)rex->lastparen >= nums[n] &&
3860    rex->offs[nums[n]].end != -1)
3861   {
3862    return nums[n];
3863   }
3864  }
3865  return 0;
3866 }
3867
3868
3869 static bool
3870 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3871   U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3872 {
3873  /* This function determines if there are one or two characters that match
3874  * the first character of the passed-in EXACTish node <text_node>, and if
3875  * so, returns them in the passed-in pointers.
3876  *
3877  * If it determines that no possible character in the target string can
3878  * match, it returns FALSE; otherwise TRUE.  (The FALSE situation occurs if
3879  * the first character in <text_node> requires UTF-8 to represent, and the
3880  * target string isn't in UTF-8.)
3881  *
3882  * If there are more than two characters that could match the beginning of
3883  * <text_node>, or if more context is required to determine a match or not,
3884  * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3885  *
3886  * The motiviation behind this function is to allow the caller to set up
3887  * tight loops for matching.  If <text_node> is of type EXACT, there is
3888  * only one possible character that can match its first character, and so
3889  * the situation is quite simple.  But things get much more complicated if
3890  * folding is involved.  It may be that the first character of an EXACTFish
3891  * node doesn't participate in any possible fold, e.g., punctuation, so it
3892  * can be matched only by itself.  The vast majority of characters that are
3893  * in folds match just two things, their lower and upper-case equivalents.
3894  * But not all are like that; some have multiple possible matches, or match
3895  * sequences of more than one character.  This function sorts all that out.
3896  *
3897  * Consider the patterns A*B or A*?B where A and B are arbitrary.  In a
3898  * loop of trying to match A*, we know we can't exit where the thing
3899  * following it isn't a B.  And something can't be a B unless it is the
3900  * beginning of B.  By putting a quick test for that beginning in a tight
3901  * loop, we can rule out things that can't possibly be B without having to
3902  * break out of the loop, thus avoiding work.  Similarly, if A is a single
3903  * character, we can make a tight loop matching A*, using the outputs of
3904  * this function.
3905  *
3906  * If the target string to match isn't in UTF-8, and there aren't
3907  * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3908  * the one or two possible octets (which are characters in this situation)
3909  * that can match.  In all cases, if there is only one character that can
3910  * match, *<c1p> and *<c2p> will be identical.
3911  *
3912  * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3913  * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3914  * can match the beginning of <text_node>.  They should be declared with at
3915  * least length UTF8_MAXBYTES+1.  (If the target string isn't in UTF-8, it is
3916  * undefined what these contain.)  If one or both of the buffers are
3917  * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3918  * corresponding invariant.  If variant, the corresponding *<c1p> and/or
3919  * *<c2p> will be set to a negative number(s) that shouldn't match any code
3920  * point (unless inappropriately coerced to unsigned).   *<c1p> will equal
3921  * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3922
3923  const bool utf8_target = reginfo->is_utf8_target;
3924
3925  UV c1 = (UV)CHRTEST_NOT_A_CP_1;
3926  UV c2 = (UV)CHRTEST_NOT_A_CP_2;
3927  bool use_chrtest_void = FALSE;
3928  const bool is_utf8_pat = reginfo->is_utf8_pat;
3929
3930  /* Used when we have both utf8 input and utf8 output, to avoid converting
3931  * to/from code points */
3932  bool utf8_has_been_setup = FALSE;
3933
3934  dVAR;
3935
3936  U8 *pat = (U8*)STRING(text_node);
3937  U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3938
3939  if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
3940
3941   /* In an exact node, only one thing can be matched, that first
3942   * character.  If both the pat and the target are UTF-8, we can just
3943   * copy the input to the output, avoiding finding the code point of
3944   * that character */
3945   if (!is_utf8_pat) {
3946    c2 = c1 = *pat;
3947   }
3948   else if (utf8_target) {
3949    Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3950    Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3951    utf8_has_been_setup = TRUE;
3952   }
3953   else {
3954    c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3955   }
3956  }
3957  else { /* an EXACTFish node */
3958   U8 *pat_end = pat + STR_LEN(text_node);
3959
3960   /* An EXACTFL node has at least some characters unfolded, because what
3961   * they match is not known until now.  So, now is the time to fold
3962   * the first few of them, as many as are needed to determine 'c1' and
3963   * 'c2' later in the routine.  If the pattern isn't UTF-8, we only need
3964   * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3965   * else is 1-1 and isn't assumed to be folded.  In a UTF-8 pattern, we
3966   * need to fold as many characters as a single character can fold to,
3967   * so that later we can check if the first ones are such a multi-char
3968   * fold.  But, in such a pattern only locale-problematic characters
3969   * aren't folded, so we can skip this completely if the first character
3970   * in the node isn't one of the tricky ones */
3971   if (OP(text_node) == EXACTFL) {
3972
3973    if (! is_utf8_pat) {
3974     if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3975     {
3976      folded[0] = folded[1] = 's';
3977      pat = folded;
3978      pat_end = folded + 2;
3979     }
3980    }
3981    else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3982     U8 *s = pat;
3983     U8 *d = folded;
3984     int i;
3985
3986     for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3987      if (isASCII(*s)) {
3988       *(d++) = (U8) toFOLD_LC(*s);
3989       s++;
3990      }
3991      else {
3992       STRLEN len;
3993       _to_utf8_fold_flags(s,
3994            d,
3995            &len,
3996            FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3997       d += len;
3998       s += UTF8SKIP(s);
3999      }
4000     }
4001
4002     pat = folded;
4003     pat_end = d;
4004    }
4005   }
4006
4007   if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4008    || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4009   {
4010    /* Multi-character folds require more context to sort out.  Also
4011    * PL_utf8_foldclosures used below doesn't handle them, so have to
4012    * be handled outside this routine */
4013    use_chrtest_void = TRUE;
4014   }
4015   else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4016    c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4017    if (c1 > 255) {
4018     /* Load the folds hash, if not already done */
4019     SV** listp;
4020     if (! PL_utf8_foldclosures) {
4021      _load_PL_utf8_foldclosures();
4022     }
4023
4024     /* The fold closures data structure is a hash with the keys
4025     * being the UTF-8 of every character that is folded to, like
4026     * 'k', and the values each an array of all code points that
4027     * fold to its key.  e.g. [ 'k', 'K', KELVIN_SIGN ].
4028     * Multi-character folds are not included */
4029     if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4030           (char *) pat,
4031           UTF8SKIP(pat),
4032           FALSE))))
4033     {
4034      /* Not found in the hash, therefore there are no folds
4035      * containing it, so there is only a single character that
4036      * could match */
4037      c2 = c1;
4038     }
4039     else {  /* Does participate in folds */
4040      AV* list = (AV*) *listp;
4041      if (av_tindex(list) != 1) {
4042
4043       /* If there aren't exactly two folds to this, it is
4044       * outside the scope of this function */
4045       use_chrtest_void = TRUE;
4046      }
4047      else {  /* There are two.  Get them */
4048       SV** c_p = av_fetch(list, 0, FALSE);
4049       if (c_p == NULL) {
4050        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4051       }
4052       c1 = SvUV(*c_p);
4053
4054       c_p = av_fetch(list, 1, FALSE);
4055       if (c_p == NULL) {
4056        Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4057       }
4058       c2 = SvUV(*c_p);
4059
4060       /* Folds that cross the 255/256 boundary are forbidden
4061       * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4062       * one is ASCIII.  Since the pattern character is above
4063       * 255, and its only other match is below 256, the only
4064       * legal match will be to itself.  We have thrown away
4065       * the original, so have to compute which is the one
4066       * above 255. */
4067       if ((c1 < 256) != (c2 < 256)) {
4068        if ((OP(text_node) == EXACTFL
4069         && ! IN_UTF8_CTYPE_LOCALE)
4070         || ((OP(text_node) == EXACTFA
4071          || OP(text_node) == EXACTFA_NO_TRIE)
4072          && (isASCII(c1) || isASCII(c2))))
4073        {
4074         if (c1 < 256) {
4075          c1 = c2;
4076         }
4077         else {
4078          c2 = c1;
4079         }
4080        }
4081       }
4082      }
4083     }
4084    }
4085    else /* Here, c1 is <= 255 */
4086     if (utf8_target
4087      && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4088      && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4089      && ((OP(text_node) != EXACTFA
4090       && OP(text_node) != EXACTFA_NO_TRIE)
4091       || ! isASCII(c1)))
4092    {
4093     /* Here, there could be something above Latin1 in the target
4094     * which folds to this character in the pattern.  All such
4095     * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4096     * than two characters involved in their folds, so are outside
4097     * the scope of this function */
4098     if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4099      c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4100     }
4101     else {
4102      use_chrtest_void = TRUE;
4103     }
4104    }
4105    else { /* Here nothing above Latin1 can fold to the pattern
4106      character */
4107     switch (OP(text_node)) {
4108
4109      case EXACTFL:   /* /l rules */
4110       c2 = PL_fold_locale[c1];
4111       break;
4112
4113      case EXACTF:   /* This node only generated for non-utf8
4114          patterns */
4115       assert(! is_utf8_pat);
4116       if (! utf8_target) {    /* /d rules */
4117        c2 = PL_fold[c1];
4118        break;
4119       }
4120       /* FALLTHROUGH */
4121       /* /u rules for all these.  This happens to work for
4122       * EXACTFA as nothing in Latin1 folds to ASCII */
4123      case EXACTFA_NO_TRIE:   /* This node only generated for
4124            non-utf8 patterns */
4125       assert(! is_utf8_pat);
4126       /* FALLTHROUGH */
4127      case EXACTFA:
4128      case EXACTFU_SS:
4129      case EXACTFU:
4130       c2 = PL_fold_latin1[c1];
4131       break;
4132
4133      default:
4134       Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4135       NOT_REACHED; /* NOTREACHED */
4136     }
4137    }
4138   }
4139  }
4140
4141  /* Here have figured things out.  Set up the returns */
4142  if (use_chrtest_void) {
4143   *c2p = *c1p = CHRTEST_VOID;
4144  }
4145  else if (utf8_target) {
4146   if (! utf8_has_been_setup) {    /* Don't have the utf8; must get it */
4147    uvchr_to_utf8(c1_utf8, c1);
4148    uvchr_to_utf8(c2_utf8, c2);
4149   }
4150
4151   /* Invariants are stored in both the utf8 and byte outputs; Use
4152   * negative numbers otherwise for the byte ones.  Make sure that the
4153   * byte ones are the same iff the utf8 ones are the same */
4154   *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4155   *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4156     ? *c2_utf8
4157     : (c1 == c2)
4158     ? CHRTEST_NOT_A_CP_1
4159     : CHRTEST_NOT_A_CP_2;
4160  }
4161  else if (c1 > 255) {
4162  if (c2 > 255) {  /* both possibilities are above what a non-utf8 string
4163       can represent */
4164   return FALSE;
4165  }
4166
4167  *c1p = *c2p = c2;    /* c2 is the only representable value */
4168  }
4169  else {  /* c1 is representable; see about c2 */
4170  *c1p = c1;
4171  *c2p = (c2 < 256) ? c2 : c1;
4172  }
4173
4174  return TRUE;
4175 }
4176
4177 /* This creates a single number by combining two, with 'before' being like the
4178  * 10's digit, but this isn't necessarily base 10; it is base however many
4179  * elements of the enum there are */
4180 #define GCBcase(before, after) ((GCB_ENUM_COUNT * before) + after)
4181
4182 STATIC bool
4183 S_isGCB(const GCB_enum before, const GCB_enum after)
4184 {
4185  /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4186  * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4187
4188  switch (GCBcase(before, after)) {
4189
4190   /*  Break at the start and end of text.
4191    GB1.   sot ÷
4192    GB2.   ÷ eot
4193
4194    Break before and after controls except between CR and LF
4195    GB4.  ( Control | CR | LF )  ÷
4196    GB5.   ÷  ( Control | CR | LF )
4197
4198    Otherwise, break everywhere.
4199    GB10.  Any  ÷  Any */
4200   default:
4201    return TRUE;
4202
4203   /* Do not break between a CR and LF.
4204    GB3.  CR  ×  LF */
4205   case GCBcase(GCB_CR, GCB_LF):
4206    return FALSE;
4207
4208   /* Do not break Hangul syllable sequences.
4209    GB6.  L  ×  ( L | V | LV | LVT ) */
4210   case GCBcase(GCB_L, GCB_L):
4211   case GCBcase(GCB_L, GCB_V):
4212   case GCBcase(GCB_L, GCB_LV):
4213   case GCBcase(GCB_L, GCB_LVT):
4214    return FALSE;
4215
4216   /*  GB7.  ( LV | V )  ×  ( V | T ) */
4217   case GCBcase(GCB_LV, GCB_V):
4218   case GCBcase(GCB_LV, GCB_T):
4219   case GCBcase(GCB_V, GCB_V):
4220   case GCBcase(GCB_V, GCB_T):
4221    return FALSE;
4222
4223   /*  GB8.  ( LVT | T)  ×  T */
4224   case GCBcase(GCB_LVT, GCB_T):
4225   case GCBcase(GCB_T, GCB_T):
4226    return FALSE;
4227
4228   /* Do not break between regional indicator symbols.
4229    GB8a.  Regional_Indicator  ×  Regional_Indicator */
4230   case GCBcase(GCB_Regional_Indicator, GCB_Regional_Indicator):
4231    return FALSE;
4232
4233   /* Do not break before extending characters.
4234    GB9.     ×  Extend */
4235   case GCBcase(GCB_Other, GCB_Extend):
4236   case GCBcase(GCB_Extend, GCB_Extend):
4237   case GCBcase(GCB_L, GCB_Extend):
4238   case GCBcase(GCB_LV, GCB_Extend):
4239   case GCBcase(GCB_LVT, GCB_Extend):
4240   case GCBcase(GCB_Prepend, GCB_Extend):
4241   case GCBcase(GCB_Regional_Indicator, GCB_Extend):
4242   case GCBcase(GCB_SpacingMark, GCB_Extend):
4243   case GCBcase(GCB_T, GCB_Extend):
4244   case GCBcase(GCB_V, GCB_Extend):
4245    return FALSE;
4246
4247   /* Do not break before SpacingMarks, or after Prepend characters.
4248    GB9a.     ×  SpacingMark */
4249   case GCBcase(GCB_Other, GCB_SpacingMark):
4250   case GCBcase(GCB_Extend, GCB_SpacingMark):
4251   case GCBcase(GCB_L, GCB_SpacingMark):
4252   case GCBcase(GCB_LV, GCB_SpacingMark):
4253   case GCBcase(GCB_LVT, GCB_SpacingMark):
4254   case GCBcase(GCB_Prepend, GCB_SpacingMark):
4255   case GCBcase(GCB_Regional_Indicator, GCB_SpacingMark):
4256   case GCBcase(GCB_SpacingMark, GCB_SpacingMark):
4257   case GCBcase(GCB_T, GCB_SpacingMark):
4258   case GCBcase(GCB_V, GCB_SpacingMark):
4259    return FALSE;
4260
4261   /* GB9b.  Prepend  ×   */
4262   case GCBcase(GCB_Prepend, GCB_Other):
4263   case GCBcase(GCB_Prepend, GCB_L):
4264   case GCBcase(GCB_Prepend, GCB_LV):
4265   case GCBcase(GCB_Prepend, GCB_LVT):
4266   case GCBcase(GCB_Prepend, GCB_Prepend):
4267   case GCBcase(GCB_Prepend, GCB_Regional_Indicator):
4268   case GCBcase(GCB_Prepend, GCB_T):
4269   case GCBcase(GCB_Prepend, GCB_V):
4270    return FALSE;
4271  }
4272
4273  NOT_REACHED; /* NOTREACHED */
4274 }
4275
4276 #define SBcase(before, after) ((SB_ENUM_COUNT * before) + after)
4277
4278 STATIC bool
4279 S_isSB(pTHX_ SB_enum before,
4280    SB_enum after,
4281    const U8 * const strbeg,
4282    const U8 * const curpos,
4283    const U8 * const strend,
4284    const bool utf8_target)
4285 {
4286  /* returns a boolean indicating if there is a Sentence Boundary Break
4287  * between the inputs.  See http://www.unicode.org/reports/tr29/ */
4288
4289  U8 * lpos = (U8 *) curpos;
4290  U8 * temp_pos;
4291  SB_enum backup;
4292
4293  PERL_ARGS_ASSERT_ISSB;
4294
4295  /* Break at the start and end of text.
4296   SB1.  sot  ÷
4297   SB2.  ÷  eot */
4298  if (before == SB_EDGE || after == SB_EDGE) {
4299   return TRUE;
4300  }
4301
4302  /* SB 3: Do not break within CRLF. */
4303  if (before == SB_CR && after == SB_LF) {
4304   return FALSE;
4305  }
4306
4307  /* Break after paragraph separators.  (though why CR and LF are considered
4308  * so is beyond me (khw)
4309  SB4.  Sep | CR | LF  ÷ */
4310  if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4311   return TRUE;
4312  }
4313
4314  /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4315  * (See Section 6.2, Replacing Ignore Rules.)
4316   SB5.  X (Extend | Format)*  →  X */
4317  if (after == SB_Extend || after == SB_Format) {
4318   return FALSE;
4319  }
4320
4321  if (before == SB_Extend || before == SB_Format) {
4322   before = backup_one_SB(strbeg, &lpos, utf8_target);
4323  }
4324
4325  /* Do not break after ambiguous terminators like period, if they are
4326  * immediately followed by a number or lowercase letter, if they are
4327  * between uppercase letters, if the first following letter (optionally
4328  * after certain punctuation) is lowercase, or if they are followed by
4329  * "continuation" punctuation such as comma, colon, or semicolon. For
4330  * example, a period may be an abbreviation or numeric period, and thus may
4331  * not mark the end of a sentence.
4332
4333  * SB6. ATerm  ×  Numeric */
4334  if (before == SB_ATerm && after == SB_Numeric) {
4335   return FALSE;
4336  }
4337
4338  /* SB7.  (Upper | Lower) ATerm  ×  Upper */
4339  if (before == SB_ATerm && after == SB_Upper) {
4340   temp_pos = lpos;
4341   backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4342   if (backup == SB_Upper || backup == SB_Lower) {
4343    return FALSE;
4344   }
4345  }
4346
4347  /* SB8a.  (STerm | ATerm) Close* Sp*  ×  (SContinue | STerm | ATerm)
4348  * SB10.  (STerm | ATerm) Close* Sp*  ×  ( Sp | Sep | CR | LF )      */
4349  backup = before;
4350  temp_pos = lpos;
4351  while (backup == SB_Sp) {
4352   backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4353  }
4354  while (backup == SB_Close) {
4355   backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4356  }
4357  if ((backup == SB_STerm || backup == SB_ATerm)
4358   && (   after == SB_SContinue
4359    || after == SB_STerm
4360    || after == SB_ATerm
4361    || after == SB_Sp
4362    || after == SB_Sep
4363    || after == SB_CR
4364    || after == SB_LF))
4365  {
4366   return FALSE;
4367  }
4368
4369  /* SB8.  ATerm Close* Sp*  ×  ( ¬(OLetter | Upper | Lower | Sep | CR | LF |
4370  *                                              STerm | ATerm) )* Lower */
4371  if (backup == SB_ATerm) {
4372   U8 * rpos = (U8 *) curpos;
4373   SB_enum later = after;
4374
4375   while (    later != SB_OLetter
4376     && later != SB_Upper
4377     && later != SB_Lower
4378     && later != SB_Sep
4379     && later != SB_CR
4380     && later != SB_LF
4381     && later != SB_STerm
4382     && later != SB_ATerm
4383     && later != SB_EDGE)
4384   {
4385    later = advance_one_SB(&rpos, strend, utf8_target);
4386   }
4387   if (later == SB_Lower) {
4388    return FALSE;
4389   }
4390  }
4391
4392  /* Break after sentence terminators, but include closing punctuation,
4393  * trailing spaces, and a paragraph separator (if present). [See note
4394  * below.]
4395  * SB9.  ( STerm | ATerm ) Close*  ×  ( Close | Sp | Sep | CR | LF ) */
4396  backup = before;
4397  temp_pos = lpos;
4398  while (backup == SB_Close) {
4399   backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4400  }
4401  if ((backup == SB_STerm || backup == SB_ATerm)
4402   && (   after == SB_Close
4403    || after == SB_Sp
4404    || after == SB_Sep
4405    || after == SB_CR
4406    || after == SB_LF))
4407  {
4408   return FALSE;
4409  }
4410
4411
4412  /* SB11.  ( STerm | ATerm ) Close* Sp* ( Sep | CR | LF )?  ÷ */
4413  temp_pos = lpos;
4414  backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4415  if (   backup == SB_Sep
4416   || backup == SB_CR
4417   || backup == SB_LF)
4418  {
4419   lpos = temp_pos;
4420  }
4421  else {
4422   backup = before;
4423  }
4424  while (backup == SB_Sp) {
4425   backup = backup_one_SB(strbeg, &lpos, utf8_target);
4426  }
4427  while (backup == SB_Close) {
4428   backup = backup_one_SB(strbeg, &lpos, utf8_target);
4429  }
4430  if (backup == SB_STerm || backup == SB_ATerm) {
4431   return TRUE;
4432  }
4433
4434  /* Otherwise, do not break.
4435  SB12.  Any  ×  Any */
4436
4437  return FALSE;
4438 }
4439
4440 STATIC SB_enum
4441 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4442 {
4443  SB_enum sb;
4444
4445  PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4446
4447  if (*curpos >= strend) {
4448   return SB_EDGE;
4449  }
4450
4451  if (utf8_target) {
4452   do {
4453    *curpos += UTF8SKIP(*curpos);
4454    if (*curpos >= strend) {
4455     return SB_EDGE;
4456    }
4457    sb = getSB_VAL_UTF8(*curpos, strend);
4458   } while (sb == SB_Extend || sb == SB_Format);
4459  }
4460  else {
4461   do {
4462    (*curpos)++;
4463    if (*curpos >= strend) {
4464     return SB_EDGE;
4465    }
4466    sb = getSB_VAL_CP(**curpos);
4467   } while (sb == SB_Extend || sb == SB_Format);
4468  }
4469
4470  return sb;
4471 }
4472
4473 STATIC SB_enum
4474 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4475 {
4476  SB_enum sb;
4477
4478  PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4479
4480  if (*curpos < strbeg) {
4481   return SB_EDGE;
4482  }
4483
4484  if (utf8_target) {
4485   U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4486   if (! prev_char_pos) {
4487    return SB_EDGE;
4488   }
4489
4490   /* Back up over Extend and Format.  curpos is always just to the right
4491   * of the characater whose value we are getting */
4492   do {
4493    U8 * prev_prev_char_pos;
4494    if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4495                  strbeg)))
4496    {
4497     sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4498     *curpos = prev_char_pos;
4499     prev_char_pos = prev_prev_char_pos;
4500    }
4501    else {
4502     *curpos = (U8 *) strbeg;
4503     return SB_EDGE;
4504    }
4505   } while (sb == SB_Extend || sb == SB_Format);
4506  }
4507  else {
4508   do {
4509    if (*curpos - 2 < strbeg) {
4510     *curpos = (U8 *) strbeg;
4511     return SB_EDGE;
4512    }
4513    (*curpos)--;
4514    sb = getSB_VAL_CP(*(*curpos - 1));
4515   } while (sb == SB_Extend || sb == SB_Format);
4516  }
4517
4518  return sb;
4519 }
4520
4521 #define WBcase(before, after) ((WB_ENUM_COUNT * before) + after)
4522
4523 STATIC bool
4524 S_isWB(pTHX_ WB_enum previous,
4525    WB_enum before,
4526    WB_enum after,
4527    const U8 * const strbeg,
4528    const U8 * const curpos,
4529    const U8 * const strend,
4530    const bool utf8_target)
4531 {
4532  /*  Return a boolean as to if the boundary between 'before' and 'after' is
4533  *  a Unicode word break, using their published algorithm.  Context may be
4534  *  needed to make this determination.  If the value for the character
4535  *  before 'before' is known, it is passed as 'previous'; otherwise that
4536  *  should be set to WB_UNKNOWN.  The other input parameters give the
4537  *  boundaries and current position in the matching of the string.  That
4538  *  is, 'curpos' marks the position where the character whose wb value is
4539  *  'after' begins.  See http://www.unicode.org/reports/tr29/ */
4540
4541  U8 * before_pos = (U8 *) curpos;
4542  U8 * after_pos = (U8 *) curpos;
4543
4544  PERL_ARGS_ASSERT_ISWB;
4545
4546  /* WB1 and WB2: Break at the start and end of text. */
4547  if (before == WB_EDGE || after == WB_EDGE) {
4548   return TRUE;
4549  }
4550
4551  /* WB 3: Do not break within CRLF. */
4552  if (before == WB_CR && after == WB_LF) {
4553   return FALSE;
4554  }
4555
4556  /* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
4557  * and LF) */
4558  if (   before == WB_CR || before == WB_LF || before == WB_Newline
4559   || after ==  WB_CR || after ==  WB_LF || after ==  WB_Newline)
4560  {
4561   return TRUE;
4562  }
4563
4564  /* Ignore Format and Extend characters, except when they appear at the
4565  * beginning of a region of text.
4566  * WB4.  X (Extend | Format)*  →  X. */
4567
4568  if (after == WB_Extend || after == WB_Format) {
4569   return FALSE;
4570  }
4571
4572  if (before == WB_Extend || before == WB_Format) {
4573   before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
4574  }
4575
4576  switch (WBcase(before, after)) {
4577    /* Otherwise, break everywhere (including around ideographs).
4578     WB14.  Any  ÷  Any */
4579    default:
4580     return TRUE;
4581
4582    /* Do not break between most letters.
4583     WB5.  (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */
4584    case WBcase(WB_ALetter, WB_ALetter):
4585    case WBcase(WB_ALetter, WB_Hebrew_Letter):
4586    case WBcase(WB_Hebrew_Letter, WB_ALetter):
4587    case WBcase(WB_Hebrew_Letter, WB_Hebrew_Letter):
4588     return FALSE;
4589
4590    /* Do not break letters across certain punctuation.
4591     WB6.  (ALetter | Hebrew_Letter)
4592       × (MidLetter | MidNumLet | Single_Quote) (ALetter
4593                | Hebrew_Letter) */
4594    case WBcase(WB_ALetter, WB_MidLetter):
4595    case WBcase(WB_ALetter, WB_MidNumLet):
4596    case WBcase(WB_ALetter, WB_Single_Quote):
4597    case WBcase(WB_Hebrew_Letter, WB_MidLetter):
4598    case WBcase(WB_Hebrew_Letter, WB_MidNumLet):
4599    /*case WBcase(WB_Hebrew_Letter, WB_Single_Quote):*/
4600     after = advance_one_WB(&after_pos, strend, utf8_target);
4601     return after != WB_ALetter && after != WB_Hebrew_Letter;
4602
4603    /* WB7.  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
4604    *                    Single_Quote) ×  (ALetter | Hebrew_Letter) */
4605    case WBcase(WB_MidLetter, WB_ALetter):
4606    case WBcase(WB_MidLetter, WB_Hebrew_Letter):
4607    case WBcase(WB_MidNumLet, WB_ALetter):
4608    case WBcase(WB_MidNumLet, WB_Hebrew_Letter):
4609    case WBcase(WB_Single_Quote, WB_ALetter):
4610    case WBcase(WB_Single_Quote, WB_Hebrew_Letter):
4611     before
4612     = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
4613     return before != WB_ALetter && before != WB_Hebrew_Letter;
4614
4615    /* WB7a.  Hebrew_Letter  ×  Single_Quote */
4616    case WBcase(WB_Hebrew_Letter, WB_Single_Quote):
4617     return FALSE;
4618
4619    /* WB7b.  Hebrew_Letter  ×  Double_Quote Hebrew_Letter */
4620    case WBcase(WB_Hebrew_Letter, WB_Double_Quote):
4621     return advance_one_WB(&after_pos, strend, utf8_target)
4622               != WB_Hebrew_Letter;
4623
4624    /* WB7c.  Hebrew_Letter Double_Quote  ×  Hebrew_Letter */
4625    case WBcase(WB_Double_Quote, WB_Hebrew_Letter):
4626     return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
4627               != WB_Hebrew_Letter;
4628
4629    /* Do not break within sequences of digits, or digits adjacent to
4630    * letters (“3a”, or “A3”).
4631     WB8.  Numeric  ×  Numeric */
4632    case WBcase(WB_Numeric, WB_Numeric):
4633     return FALSE;
4634
4635    /* WB9.  (ALetter | Hebrew_Letter)  ×  Numeric */
4636    case WBcase(WB_ALetter, WB_Numeric):
4637    case WBcase(WB_Hebrew_Letter, WB_Numeric):
4638     return FALSE;
4639
4640    /* WB10.  Numeric  ×  (ALetter | Hebrew_Letter) */
4641    case WBcase(WB_Numeric, WB_ALetter):
4642    case WBcase(WB_Numeric, WB_Hebrew_Letter):
4643     return FALSE;
4644
4645    /* Do not break within sequences, such as “3.2” or “3,456.789”.
4646     WB11.   Numeric (MidNum | MidNumLet | Single_Quote)  ×  Numeric
4647    */
4648    case WBcase(WB_MidNum, WB_Numeric):
4649    case WBcase(WB_MidNumLet, WB_Numeric):
4650    case WBcase(WB_Single_Quote, WB_Numeric):
4651     return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
4652                != WB_Numeric;
4653
4654    /*  WB12.   Numeric  ×  (MidNum | MidNumLet | Single_Quote) Numeric
4655    *  */
4656    case WBcase(WB_Numeric, WB_MidNum):
4657    case WBcase(WB_Numeric, WB_MidNumLet):
4658    case WBcase(WB_Numeric, WB_Single_Quote):
4659     return advance_one_WB(&after_pos, strend, utf8_target)
4660                != WB_Numeric;
4661
4662    /* Do not break between Katakana.
4663    WB13.  Katakana  ×  Katakana */
4664    case WBcase(WB_Katakana, WB_Katakana):
4665     return FALSE;
4666
4667    /* Do not break from extenders.
4668    WB13a.  (ALetter | Hebrew_Letter | Numeric | Katakana |
4669            ExtendNumLet)  ×  ExtendNumLet */
4670    case WBcase(WB_ALetter, WB_ExtendNumLet):
4671    case WBcase(WB_Hebrew_Letter, WB_ExtendNumLet):
4672    case WBcase(WB_Numeric, WB_ExtendNumLet):
4673    case WBcase(WB_Katakana, WB_ExtendNumLet):
4674    case WBcase(WB_ExtendNumLet, WB_ExtendNumLet):
4675     return FALSE;
4676
4677    /* WB13b.  ExtendNumLet  ×  (ALetter | Hebrew_Letter | Numeric
4678    *                                                 | Katakana) */
4679    case WBcase(WB_ExtendNumLet, WB_ALetter):
4680    case WBcase(WB_ExtendNumLet, WB_Hebrew_Letter):
4681    case WBcase(WB_ExtendNumLet, WB_Numeric):
4682    case WBcase(WB_ExtendNumLet, WB_Katakana):
4683     return FALSE;
4684
4685    /* Do not break between regional indicator symbols.
4686    WB13c.  Regional_Indicator  ×  Regional_Indicator */
4687    case WBcase(WB_Regional_Indicator, WB_Regional_Indicator):
4688     return FALSE;
4689
4690  }
4691
4692  NOT_REACHED; /* NOTREACHED */
4693 }
4694
4695 STATIC WB_enum
4696 S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4697 {
4698  WB_enum wb;
4699
4700  PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
4701
4702  if (*curpos >= strend) {
4703   return WB_EDGE;
4704  }
4705
4706  if (utf8_target) {
4707
4708   /* Advance over Extend and Format */
4709   do {
4710    *curpos += UTF8SKIP(*curpos);
4711    if (*curpos >= strend) {
4712     return WB_EDGE;
4713    }
4714    wb = getWB_VAL_UTF8(*curpos, strend);
4715   } while (wb == WB_Extend || wb == WB_Format);
4716  }
4717  else {
4718   do {
4719    (*curpos)++;
4720    if (*curpos >= strend) {
4721     return WB_EDGE;
4722    }
4723    wb = getWB_VAL_CP(**curpos);
4724   } while (wb == WB_Extend || wb == WB_Format);
4725  }
4726
4727  return wb;
4728 }
4729
4730 STATIC WB_enum
4731 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4732 {
4733  WB_enum wb;
4734
4735  PERL_ARGS_ASSERT_BACKUP_ONE_WB;
4736
4737  /* If we know what the previous character's break value is, don't have
4738   * to look it up */
4739  if (*previous != WB_UNKNOWN) {
4740   wb = *previous;
4741   *previous = WB_UNKNOWN;
4742   /* XXX Note that doesn't change curpos, and maybe should */
4743
4744   /* But we always back up over these two types */
4745   if (wb != WB_Extend && wb != WB_Format) {
4746    return wb;
4747   }
4748  }
4749
4750  if (*curpos < strbeg) {
4751   return WB_EDGE;
4752  }
4753
4754  if (utf8_target) {
4755   U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4756   if (! prev_char_pos) {
4757    return WB_EDGE;
4758   }
4759
4760   /* Back up over Extend and Format.  curpos is always just to the right
4761   * of the characater whose value we are getting */
4762   do {
4763    U8 * prev_prev_char_pos;
4764    if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
4765             -1,
4766             strbeg)))
4767    {
4768     wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4769     *curpos = prev_char_pos;
4770     prev_char_pos = prev_prev_char_pos;
4771    }
4772    else {
4773     *curpos = (U8 *) strbeg;
4774     return WB_EDGE;
4775    }
4776   } while (wb == WB_Extend || wb == WB_Format);
4777  }
4778  else {
4779   do {
4780    if (*curpos - 2 < strbeg) {
4781     *curpos = (U8 *) strbeg;
4782     return WB_EDGE;
4783    }
4784    (*curpos)--;
4785    wb = getWB_VAL_CP(*(*curpos - 1));
4786   } while (wb == WB_Extend || wb == WB_Format);
4787  }
4788
4789  return wb;
4790 }
4791
4792 /* returns -1 on failure, $+[0] on success */
4793 STATIC SSize_t
4794 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
4795 {
4796 #if PERL_VERSION < 9 && !defined(PERL_CORE)
4797  dMY_CXT;
4798 #endif
4799  dVAR;
4800  const bool utf8_target = reginfo->is_utf8_target;
4801  const U32 uniflags = UTF8_ALLOW_DEFAULT;
4802  REGEXP *rex_sv = reginfo->prog;
4803  regexp *rex = ReANY(rex_sv);
4804  RXi_GET_DECL(rex,rexi);
4805  /* the current state. This is a cached copy of PL_regmatch_state */
4806  regmatch_state *st;
4807  /* cache heavy used fields of st in registers */
4808  regnode *scan;
4809  regnode *next;
4810  U32 n = 0; /* general value; init to avoid compiler warning */
4811  SSize_t ln = 0; /* len or last;  init to avoid compiler warning */
4812  char *locinput = startpos;
4813  char *pushinput; /* where to continue after a PUSH */
4814  I32 nextchr;   /* is always set to UCHARAT(locinput) */
4815
4816  bool result = 0;     /* return value of S_regmatch */
4817  int depth = 0;     /* depth of backtrack stack */
4818  U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
4819  const U32 max_nochange_depth =
4820   (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
4821   3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
4822  regmatch_state *yes_state = NULL; /* state to pop to on success of
4823                subpattern */
4824  /* mark_state piggy backs on the yes_state logic so that when we unwind
4825  the stack on success we can update the mark_state as we go */
4826  regmatch_state *mark_state = NULL; /* last mark state we have seen */
4827  regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
4828  struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
4829  U32 state_num;
4830  bool no_final = 0;      /* prevent failure from backtracking? */
4831  bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
4832  char *startpoint = locinput;
4833  SV *popmark = NULL;     /* are we looking for a mark? */
4834  SV *sv_commit = NULL;   /* last mark name seen in failure */
4835  SV *sv_yes_mark = NULL; /* last mark name we have seen
4836        during a successful match */
4837  U32 lastopen = 0;       /* last open we saw */
4838  bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
4839  SV* const oreplsv = GvSVn(PL_replgv);
4840  /* these three flags are set by various ops to signal information to
4841  * the very next op. They have a useful lifetime of exactly one loop
4842  * iteration, and are not preserved or restored by state pushes/pops
4843  */
4844  bool sw = 0;     /* the condition value in (?(cond)a|b) */
4845  bool minmod = 0;     /* the next "{n,m}" is a "{n,m}?" */
4846  int logical = 0;     /* the following EVAL is:
4847         0: (?{...})
4848         1: (?(?{...})X|Y)
4849         2: (??{...})
4850        or the following IFMATCH/UNLESSM is:
4851         false: plain (?=foo)
4852         true:  used as a condition: (?(?=foo))
4853        */
4854  PAD* last_pad = NULL;
4855  dMULTICALL;
4856  I32 gimme = G_SCALAR;
4857  CV *caller_cv = NULL; /* who called us */
4858  CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
4859  CHECKPOINT runops_cp; /* savestack position before executing EVAL */
4860  U32 maxopenparen = 0;       /* max '(' index seen so far */
4861  int to_complement;  /* Invert the result? */
4862  _char_class_number classnum;
4863  bool is_utf8_pat = reginfo->is_utf8_pat;
4864  bool match = FALSE;
4865
4866
4867 #ifdef DEBUGGING
4868  GET_RE_DEBUG_FLAGS_DECL;
4869 #endif
4870
4871  /* protect against undef(*^R) */
4872  SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
4873
4874  /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
4875  multicall_oldcatch = 0;
4876  multicall_cv = NULL;
4877  cx = NULL;
4878  PERL_UNUSED_VAR(multicall_cop);
4879  PERL_UNUSED_VAR(newsp);
4880
4881
4882  PERL_ARGS_ASSERT_REGMATCH;
4883
4884  DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
4885    PerlIO_printf(Perl_debug_log,"regmatch start\n");
4886  }));
4887
4888  st = PL_regmatch_state;
4889
4890  /* Note that nextchr is a byte even in UTF */
4891  SET_nextchr;
4892  scan = prog;
4893  while (scan != NULL) {
4894
4895   DEBUG_EXECUTE_r( {
4896    SV * const prop = sv_newmortal();
4897    regnode *rnext=regnext(scan);
4898    DUMP_EXEC_POS( locinput, scan, utf8_target );
4899    regprop(rex, prop, scan, reginfo, NULL);
4900
4901    PerlIO_printf(Perl_debug_log,
4902      "%3"IVdf":%*s%s(%"IVdf")\n",
4903      (IV)(scan - rexi->program), depth*2, "",
4904      SvPVX_const(prop),
4905      (PL_regkind[OP(scan)] == END || !rnext) ?
4906       0 : (IV)(rnext - rexi->program));
4907   });
4908
4909   next = scan + NEXT_OFF(scan);
4910   if (next == scan)
4911    next = NULL;
4912   state_num = OP(scan);
4913
4914   REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
4915  reenter_switch:
4916   to_complement = 0;
4917
4918   SET_nextchr;
4919   assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
4920
4921   switch (state_num) {
4922   case SBOL: /*  /^../ and /\A../  */
4923    if (locinput == reginfo->strbeg)
4924     break;
4925    sayNO;
4926
4927   case MBOL: /*  /^../m  */
4928    if (locinput == reginfo->strbeg ||
4929     (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
4930    {
4931     break;
4932    }
4933    sayNO;
4934
4935   case GPOS: /*  \G  */
4936    if (locinput == reginfo->ganch)
4937     break;
4938    sayNO;
4939
4940   case KEEPS: /*   \K  */
4941    /* update the startpoint */
4942    st->u.keeper.val = rex->offs[0].start;
4943    rex->offs[0].start = locinput - reginfo->strbeg;
4944    PUSH_STATE_GOTO(KEEPS_next, next, locinput);
4945    /* NOTREACHED */
4946    NOT_REACHED; /* NOTREACHED */
4947
4948   case KEEPS_next_fail:
4949    /* rollback the start point change */
4950    rex->offs[0].start = st->u.keeper.val;
4951    sayNO_SILENT;
4952    /* NOTREACHED */
4953    NOT_REACHED; /* NOTREACHED */
4954
4955   case MEOL: /* /..$/m  */
4956    if (!NEXTCHR_IS_EOS && nextchr != '\n')
4957     sayNO;
4958    break;
4959
4960   case SEOL: /* /..$/  */
4961    if (!NEXTCHR_IS_EOS && nextchr != '\n')
4962     sayNO;
4963    if (reginfo->strend - locinput > 1)
4964     sayNO;
4965    break;
4966
4967   case EOS: /*  \z  */
4968    if (!NEXTCHR_IS_EOS)
4969     sayNO;
4970    break;
4971
4972   case SANY: /*  /./s  */
4973    if (NEXTCHR_IS_EOS)
4974     sayNO;
4975    goto increment_locinput;
4976
4977   case REG_ANY: /*  /./  */
4978    if ((NEXTCHR_IS_EOS) || nextchr == '\n')
4979     sayNO;
4980    goto increment_locinput;
4981
4982
4983 #undef  ST
4984 #define ST st->u.trie
4985   case TRIEC: /* (ab|cd) with known charclass */
4986    /* In this case the charclass data is available inline so
4987    we can fail fast without a lot of extra overhead.
4988    */
4989    if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
4990     DEBUG_EXECUTE_r(
4991      PerlIO_printf(Perl_debug_log,
4992        "%*s  %sfailed to match trie start class...%s\n",
4993        REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4994     );
4995     sayNO_SILENT;
4996     /* NOTREACHED */
4997     NOT_REACHED; /* NOTREACHED */
4998    }
4999    /* FALLTHROUGH */
5000   case TRIE:  /* (ab|cd)  */
5001    /* the basic plan of execution of the trie is:
5002    * At the beginning, run though all the states, and
5003    * find the longest-matching word. Also remember the position
5004    * of the shortest matching word. For example, this pattern:
5005    *    1  2 3 4    5
5006    *    ab|a|x|abcd|abc
5007    * when matched against the string "abcde", will generate
5008    * accept states for all words except 3, with the longest
5009    * matching word being 4, and the shortest being 2 (with
5010    * the position being after char 1 of the string).
5011    *
5012    * Then for each matching word, in word order (i.e. 1,2,4,5),
5013    * we run the remainder of the pattern; on each try setting
5014    * the current position to the character following the word,
5015    * returning to try the next word on failure.
5016    *
5017    * We avoid having to build a list of words at runtime by
5018    * using a compile-time structure, wordinfo[].prev, which
5019    * gives, for each word, the previous accepting word (if any).
5020    * In the case above it would contain the mappings 1->2, 2->0,
5021    * 3->0, 4->5, 5->1.  We can use this table to generate, from
5022    * the longest word (4 above), a list of all words, by
5023    * following the list of prev pointers; this gives us the
5024    * unordered list 4,5,1,2. Then given the current word we have
5025    * just tried, we can go through the list and find the
5026    * next-biggest word to try (so if we just failed on word 2,
5027    * the next in the list is 4).
5028    *
5029    * Since at runtime we don't record the matching position in
5030    * the string for each word, we have to work that out for
5031    * each word we're about to process. The wordinfo table holds
5032    * the character length of each word; given that we recorded
5033    * at the start: the position of the shortest word and its
5034    * length in chars, we just need to move the pointer the
5035    * difference between the two char lengths. Depending on
5036    * Unicode status and folding, that's cheap or expensive.
5037    *
5038    * This algorithm is optimised for the case where are only a
5039    * small number of accept states, i.e. 0,1, or maybe 2.
5040    * With lots of accepts states, and having to try all of them,
5041    * it becomes quadratic on number of accept states to find all
5042    * the next words.
5043    */
5044
5045    {
5046     /* what type of TRIE am I? (utf8 makes this contextual) */
5047     DECL_TRIE_TYPE(scan);
5048
5049     /* what trie are we using right now */
5050     reg_trie_data * const trie
5051      = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5052     HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5053     U32 state = trie->startstate;
5054
5055     if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5056      _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5057      if (utf8_target
5058       && UTF8_IS_ABOVE_LATIN1(nextchr)
5059       && scan->flags == EXACTL)
5060      {
5061       /* We only output for EXACTL, as we let the folder
5062       * output this message for EXACTFLU8 to avoid
5063       * duplication */
5064       _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5065                reginfo->strend);
5066      }
5067     }
5068     if (   trie->bitmap
5069      && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5070     {
5071      if (trie->states[ state ].wordnum) {
5072       DEBUG_EXECUTE_r(
5073        PerlIO_printf(Perl_debug_log,
5074           "%*s  %smatched empty string...%s\n",
5075           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5076       );
5077       if (!trie->jump)
5078        break;
5079      } else {
5080       DEBUG_EXECUTE_r(
5081        PerlIO_printf(Perl_debug_log,
5082           "%*s  %sfailed to match trie start class...%s\n",
5083           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5084       );
5085       sayNO_SILENT;
5086     }
5087     }
5088
5089    {
5090     U8 *uc = ( U8* )locinput;
5091
5092     STRLEN len = 0;
5093     STRLEN foldlen = 0;
5094     U8 *uscan = (U8*)NULL;
5095     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5096     U32 charcount = 0; /* how many input chars we have matched */
5097     U32 accepted = 0; /* have we seen any accepting states? */
5098
5099     ST.jump = trie->jump;
5100     ST.me = scan;
5101     ST.firstpos = NULL;
5102     ST.longfold = FALSE; /* char longer if folded => it's harder */
5103     ST.nextword = 0;
5104
5105     /* fully traverse the TRIE; note the position of the
5106     shortest accept state and the wordnum of the longest
5107     accept state */
5108
5109     while ( state && uc <= (U8*)(reginfo->strend) ) {
5110      U32 base = trie->states[ state ].trans.base;
5111      UV uvc = 0;
5112      U16 charid = 0;
5113      U16 wordnum;
5114      wordnum = trie->states[ state ].wordnum;
5115
5116      if (wordnum) { /* it's an accept state */
5117       if (!accepted) {
5118        accepted = 1;
5119        /* record first match position */
5120        if (ST.longfold) {
5121         ST.firstpos = (U8*)locinput;
5122         ST.firstchars = 0;
5123        }
5124        else {
5125         ST.firstpos = uc;
5126         ST.firstchars = charcount;
5127        }
5128       }
5129       if (!ST.nextword || wordnum < ST.nextword)
5130        ST.nextword = wordnum;
5131       ST.topword = wordnum;
5132      }
5133
5134      DEBUG_TRIE_EXECUTE_r({
5135         DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
5136         PerlIO_printf( Perl_debug_log,
5137          "%*s  %sState: %4"UVxf" Accepted: %c ",
5138          2+depth * 2, "", PL_colors[4],
5139          (UV)state, (accepted ? 'Y' : 'N'));
5140      });
5141
5142      /* read a char and goto next state */
5143      if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5144       I32 offset;
5145       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5146            uscan, len, uvc, charid, foldlen,
5147            foldbuf, uniflags);
5148       charcount++;
5149       if (foldlen>0)
5150        ST.longfold = TRUE;
5151       if (charid &&
5152        ( ((offset =
5153        base + charid - 1 - trie->uniquecharcount)) >= 0)
5154
5155        && ((U32)offset < trie->lasttrans)
5156        && trie->trans[offset].check == state)
5157       {
5158        state = trie->trans[offset].next;
5159       }
5160       else {
5161        state = 0;
5162       }
5163       uc += len;
5164
5165      }
5166      else {
5167       state = 0;
5168      }
5169      DEBUG_TRIE_EXECUTE_r(
5170       PerlIO_printf( Perl_debug_log,
5171        "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
5172        charid, uvc, (UV)state, PL_colors[5] );
5173      );
5174     }
5175     if (!accepted)
5176     sayNO;
5177
5178     /* calculate total number of accept states */
5179     {
5180      U16 w = ST.topword;
5181      accepted = 0;
5182      while (w) {
5183       w = trie->wordinfo[w].prev;
5184       accepted++;
5185      }
5186      ST.accepted = accepted;
5187     }
5188
5189     DEBUG_EXECUTE_r(
5190      PerlIO_printf( Perl_debug_log,
5191       "%*s  %sgot %"IVdf" possible matches%s\n",
5192       REPORT_CODE_OFF + depth * 2, "",
5193       PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5194     );
5195     goto trie_first_try; /* jump into the fail handler */
5196    }}
5197    /* NOTREACHED */
5198    NOT_REACHED; /* NOTREACHED */
5199
5200   case TRIE_next_fail: /* we failed - try next alternative */
5201   {
5202    U8 *uc;
5203    if ( ST.jump) {
5204     REGCP_UNWIND(ST.cp);
5205     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5206    }
5207    if (!--ST.accepted) {
5208     DEBUG_EXECUTE_r({
5209      PerlIO_printf( Perl_debug_log,
5210       "%*s  %sTRIE failed...%s\n",
5211       REPORT_CODE_OFF+depth*2, "",
5212       PL_colors[4],
5213       PL_colors[5] );
5214     });
5215     sayNO_SILENT;
5216    }
5217    {
5218     /* Find next-highest word to process.  Note that this code
5219     * is O(N^2) per trie run (O(N) per branch), so keep tight */
5220     U16 min = 0;
5221     U16 word;
5222     U16 const nextword = ST.nextword;
5223     reg_trie_wordinfo * const wordinfo
5224      = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5225     for (word=ST.topword; word; word=wordinfo[word].prev) {
5226      if (word > nextword && (!min || word < min))
5227       min = word;
5228     }
5229     ST.nextword = min;
5230    }
5231
5232   trie_first_try:
5233    if (do_cutgroup) {
5234     do_cutgroup = 0;
5235     no_final = 0;
5236    }
5237
5238    if ( ST.jump) {
5239     ST.lastparen = rex->lastparen;
5240     ST.lastcloseparen = rex->lastcloseparen;
5241     REGCP_SET(ST.cp);
5242    }
5243
5244    /* find start char of end of current word */
5245    {
5246     U32 chars; /* how many chars to skip */
5247     reg_trie_data * const trie
5248      = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5249
5250     assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5251        >=  ST.firstchars);
5252     chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5253        - ST.firstchars;
5254     uc = ST.firstpos;
5255
5256     if (ST.longfold) {
5257      /* the hard option - fold each char in turn and find
5258      * its folded length (which may be different */
5259      U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5260      STRLEN foldlen;
5261      STRLEN len;
5262      UV uvc;
5263      U8 *uscan;
5264
5265      while (chars) {
5266       if (utf8_target) {
5267        uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5268              uniflags);
5269        uc += len;
5270       }
5271       else {
5272        uvc = *uc;
5273        uc++;
5274       }
5275       uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5276       uscan = foldbuf;
5277       while (foldlen) {
5278        if (!--chars)
5279         break;
5280        uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5281            uniflags);
5282        uscan += len;
5283        foldlen -= len;
5284       }
5285      }
5286     }
5287     else {
5288      if (utf8_target)
5289       while (chars--)
5290        uc += UTF8SKIP(uc);
5291      else
5292       uc += chars;
5293     }
5294    }
5295
5296    scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5297        ? ST.jump[ST.nextword]
5298        : NEXT_OFF(ST.me));
5299
5300    DEBUG_EXECUTE_r({
5301     PerlIO_printf( Perl_debug_log,
5302      "%*s  %sTRIE matched word #%d, continuing%s\n",
5303      REPORT_CODE_OFF+depth*2, "",
5304      PL_colors[4],
5305      ST.nextword,
5306      PL_colors[5]
5307      );
5308    });
5309
5310    if (ST.accepted > 1 || has_cutgroup) {
5311     PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5312     /* NOTREACHED */
5313     NOT_REACHED; /* NOTREACHED */
5314    }
5315    /* only one choice left - just continue */
5316    DEBUG_EXECUTE_r({
5317     AV *const trie_words
5318      = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5319     SV ** const tmp = trie_words
5320       ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5321     SV *sv= tmp ? sv_newmortal() : NULL;
5322
5323     PerlIO_printf( Perl_debug_log,
5324      "%*s  %sonly one match left, short-circuiting: #%d <%s>%s\n",
5325      REPORT_CODE_OFF+depth*2, "", PL_colors[4],
5326      ST.nextword,
5327      tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5328        PL_colors[0], PL_colors[1],
5329        (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5330       )
5331      : "not compiled under -Dr",
5332      PL_colors[5] );
5333    });
5334
5335    locinput = (char*)uc;
5336    continue; /* execute rest of RE */
5337    /* NOTREACHED */
5338   }
5339 #undef  ST
5340
5341   case EXACTL:             /*  /abc/l       */
5342    _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5343
5344    /* Complete checking would involve going through every character
5345    * matched by the string to see if any is above latin1.  But the
5346    * comparision otherwise might very well be a fast assembly
5347    * language routine, and I (khw) don't think slowing things down
5348    * just to check for this warning is worth it.  So this just checks
5349    * the first character */
5350    if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5351     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5352    }
5353    /* FALLTHROUGH */
5354   case EXACT: {            /*  /abc/        */
5355    char *s = STRING(scan);
5356    ln = STR_LEN(scan);
5357    if (utf8_target != is_utf8_pat) {
5358     /* The target and the pattern have differing utf8ness. */
5359     char *l = locinput;
5360     const char * const e = s + ln;
5361
5362     if (utf8_target) {
5363      /* The target is utf8, the pattern is not utf8.
5364      * Above-Latin1 code points can't match the pattern;
5365      * invariants match exactly, and the other Latin1 ones need
5366      * to be downgraded to a single byte in order to do the
5367      * comparison.  (If we could be confident that the target
5368      * is not malformed, this could be refactored to have fewer
5369      * tests by just assuming that if the first bytes match, it
5370      * is an invariant, but there are tests in the test suite
5371      * dealing with (??{...}) which violate this) */
5372      while (s < e) {
5373       if (l >= reginfo->strend
5374        || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5375       {
5376        sayNO;
5377       }
5378       if (UTF8_IS_INVARIANT(*(U8*)l)) {
5379        if (*l != *s) {
5380         sayNO;
5381        }
5382        l++;
5383       }
5384       else {
5385        if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5386        {
5387         sayNO;
5388        }
5389        l += 2;
5390       }
5391       s++;
5392      }
5393     }
5394     else {
5395      /* The target is not utf8, the pattern is utf8. */
5396      while (s < e) {
5397       if (l >= reginfo->strend
5398        || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5399       {
5400        sayNO;
5401       }
5402       if (UTF8_IS_INVARIANT(*(U8*)s)) {
5403        if (*s != *l) {
5404         sayNO;
5405        }
5406        s++;
5407       }
5408       else {
5409        if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5410        {
5411         sayNO;
5412        }
5413        s += 2;
5414       }
5415       l++;
5416      }
5417     }
5418     locinput = l;
5419    }
5420    else {
5421     /* The target and the pattern have the same utf8ness. */
5422     /* Inline the first character, for speed. */
5423     if (reginfo->strend - locinput < ln
5424      || UCHARAT(s) != nextchr
5425      || (ln > 1 && memNE(s, locinput, ln)))
5426     {
5427      sayNO;
5428     }
5429     locinput += ln;
5430    }
5431    break;
5432    }
5433
5434   case EXACTFL: {          /*  /abc/il      */
5435    re_fold_t folder;
5436    const U8 * fold_array;
5437    const char * s;
5438    U32 fold_utf8_flags;
5439
5440    _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5441    folder = foldEQ_locale;
5442    fold_array = PL_fold_locale;
5443    fold_utf8_flags = FOLDEQ_LOCALE;
5444    goto do_exactf;
5445
5446   case EXACTFLU8:           /*  /abc/il; but all 'abc' are above 255, so
5447          is effectively /u; hence to match, target
5448          must be UTF-8. */
5449    if (! utf8_target) {
5450     sayNO;
5451    }
5452    fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5453            | FOLDEQ_S1_FOLDS_SANE;
5454    folder = foldEQ_latin1;
5455    fold_array = PL_fold_latin1;
5456    goto do_exactf;
5457
5458   case EXACTFU_SS:         /*  /\x{df}/iu   */
5459   case EXACTFU:            /*  /abc/iu      */
5460    folder = foldEQ_latin1;
5461    fold_array = PL_fold_latin1;
5462    fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
5463    goto do_exactf;
5464
5465   case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8
5466         patterns */
5467    assert(! is_utf8_pat);
5468    /* FALLTHROUGH */
5469   case EXACTFA:            /*  /abc/iaa     */
5470    folder = foldEQ_latin1;
5471    fold_array = PL_fold_latin1;
5472    fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5473    goto do_exactf;
5474
5475   case EXACTF:             /*  /abc/i    This node only generated for
5476            non-utf8 patterns */
5477    assert(! is_utf8_pat);
5478    folder = foldEQ;
5479    fold_array = PL_fold;
5480    fold_utf8_flags = 0;
5481
5482   do_exactf:
5483    s = STRING(scan);
5484    ln = STR_LEN(scan);
5485
5486    if (utf8_target
5487     || is_utf8_pat
5488     || state_num == EXACTFU_SS
5489     || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
5490    {
5491    /* Either target or the pattern are utf8, or has the issue where
5492    * the fold lengths may differ. */
5493     const char * const l = locinput;
5494     char *e = reginfo->strend;
5495
5496     if (! foldEQ_utf8_flags(s, 0,  ln, is_utf8_pat,
5497           l, &e, 0,  utf8_target, fold_utf8_flags))
5498     {
5499      sayNO;
5500     }
5501     locinput = e;
5502     break;
5503    }
5504
5505    /* Neither the target nor the pattern are utf8 */
5506    if (UCHARAT(s) != nextchr
5507     && !NEXTCHR_IS_EOS
5508     && UCHARAT(s) != fold_array[nextchr])
5509    {
5510     sayNO;
5511    }
5512    if (reginfo->strend - locinput < ln)
5513     sayNO;
5514    if (ln > 1 && ! folder(s, locinput, ln))
5515     sayNO;
5516    locinput += ln;
5517    break;
5518   }
5519
5520   case NBOUNDL: /*  /\B/l  */
5521    to_complement = 1;
5522    /* FALLTHROUGH */
5523
5524   case BOUNDL:  /*  /\b/l  */
5525    _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5526
5527    if (FLAGS(scan) != TRADITIONAL_BOUND) {
5528     if (! IN_UTF8_CTYPE_LOCALE) {
5529      Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
5530             B_ON_NON_UTF8_LOCALE_IS_WRONG);
5531     }
5532     goto boundu;
5533    }
5534
5535    if (utf8_target) {
5536     if (locinput == reginfo->strbeg)
5537      ln = isWORDCHAR_LC('\n');
5538     else {
5539      ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
5540               (U8*)(reginfo->strbeg)));
5541     }
5542     n = (NEXTCHR_IS_EOS)
5543      ? isWORDCHAR_LC('\n')
5544      : isWORDCHAR_LC_utf8((U8*)locinput);
5545    }
5546    else { /* Here the string isn't utf8 */
5547     ln = (locinput == reginfo->strbeg)
5548      ? isWORDCHAR_LC('\n')
5549      : isWORDCHAR_LC(UCHARAT(locinput - 1));
5550     n = (NEXTCHR_IS_EOS)
5551      ? isWORDCHAR_LC('\n')
5552      : isWORDCHAR_LC(nextchr);
5553    }
5554    if (to_complement ^ (ln == n)) {
5555     sayNO;
5556    }
5557    break;
5558
5559   case NBOUND:  /*  /\B/   */
5560    to_complement = 1;
5561    /* FALLTHROUGH */
5562
5563   case BOUND:   /*  /\b/   */
5564    if (utf8_target) {
5565     goto bound_utf8;
5566    }
5567    goto bound_ascii_match_only;
5568
5569   case NBOUNDA: /*  /\B/a  */
5570    to_complement = 1;
5571    /* FALLTHROUGH */
5572
5573   case BOUNDA:  /*  /\b/a  */
5574
5575   bound_ascii_match_only:
5576    /* Here the string isn't utf8, or is utf8 and only ascii characters
5577    * are to match \w.  In the latter case looking at the byte just
5578    * prior to the current one may be just the final byte of a
5579    * multi-byte character.  This is ok.  There are two cases:
5580    * 1) it is a single byte character, and then the test is doing
5581    *    just what it's supposed to.
5582    * 2) it is a multi-byte character, in which case the final byte is
5583    *    never mistakable for ASCII, and so the test will say it is
5584    *    not a word character, which is the correct answer. */
5585    ln = (locinput == reginfo->strbeg)
5586     ? isWORDCHAR_A('\n')
5587     : isWORDCHAR_A(UCHARAT(locinput - 1));
5588    n = (NEXTCHR_IS_EOS)
5589     ? isWORDCHAR_A('\n')
5590     : isWORDCHAR_A(nextchr);
5591    if (to_complement ^ (ln == n)) {
5592     sayNO;
5593    }
5594    break;
5595
5596   case NBOUNDU: /*  /\B/u  */
5597    to_complement = 1;
5598    /* FALLTHROUGH */
5599
5600   case BOUNDU:  /*  /\b/u  */
5601
5602   boundu:
5603    if (utf8_target) {
5604
5605    bound_utf8:
5606     switch((bound_type) FLAGS(scan)) {
5607      case TRADITIONAL_BOUND:
5608       ln = (locinput == reginfo->strbeg)
5609        ? 0 /* isWORDCHAR_L1('\n') */
5610        : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
5611                 (U8*)(reginfo->strbeg)));
5612       n = (NEXTCHR_IS_EOS)
5613        ? 0 /* isWORDCHAR_L1('\n') */
5614        : isWORDCHAR_utf8((U8*)locinput);
5615       match = cBOOL(ln != n);
5616       break;
5617      case GCB_BOUND:
5618       if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5619        match = TRUE; /* GCB always matches at begin and
5620            end */
5621       }
5622       else {
5623        /* Find the gcb values of previous and current
5624        * chars, then see if is a break point */
5625        match = isGCB(getGCB_VAL_UTF8(
5626             reghop3((U8*)locinput,
5627               -1,
5628               (U8*)(reginfo->strbeg)),
5629             (U8*) reginfo->strend),
5630           getGCB_VAL_UTF8((U8*) locinput,
5631               (U8*) reginfo->strend));
5632       }
5633       break;
5634
5635      case SB_BOUND: /* Always matches at begin and end */
5636       if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5637        match = TRUE;
5638       }
5639       else {
5640        match = isSB(getSB_VAL_UTF8(
5641             reghop3((U8*)locinput,
5642               -1,
5643               (U8*)(reginfo->strbeg)),
5644             (U8*) reginfo->strend),
5645           getSB_VAL_UTF8((U8*) locinput,
5646               (U8*) reginfo->strend),
5647           (U8*) reginfo->strbeg,
5648           (U8*) locinput,
5649           (U8*) reginfo->strend,
5650           utf8_target);
5651       }
5652       break;
5653
5654      case WB_BOUND:
5655       if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5656        match = TRUE;
5657       }
5658       else {
5659        match = isWB(WB_UNKNOWN,
5660           getWB_VAL_UTF8(
5661             reghop3((U8*)locinput,
5662               -1,
5663               (U8*)(reginfo->strbeg)),
5664             (U8*) reginfo->strend),
5665           getWB_VAL_UTF8((U8*) locinput,
5666               (U8*) reginfo->strend),
5667           (U8*) reginfo->strbeg,
5668           (U8*) locinput,
5669           (U8*) reginfo->strend,
5670           utf8_target);
5671       }
5672       break;
5673     }
5674    }
5675    else {  /* Not utf8 target */
5676     switch((bound_type) FLAGS(scan)) {
5677      case TRADITIONAL_BOUND:
5678       ln = (locinput == reginfo->strbeg)
5679        ? 0 /* isWORDCHAR_L1('\n') */
5680        : isWORDCHAR_L1(UCHARAT(locinput - 1));
5681       n = (NEXTCHR_IS_EOS)
5682        ? 0 /* isWORDCHAR_L1('\n') */
5683        : isWORDCHAR_L1(nextchr);
5684       match = cBOOL(ln != n);
5685       break;
5686
5687      case GCB_BOUND:
5688       if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5689        match = TRUE; /* GCB always matches at begin and
5690            end */
5691       }
5692       else {  /* Only CR-LF combo isn't a GCB in 0-255
5693         range */
5694        match =    UCHARAT(locinput - 1) != '\r'
5695          || UCHARAT(locinput) != '\n';
5696       }
5697       break;
5698
5699      case SB_BOUND: /* Always matches at begin and end */
5700       if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5701        match = TRUE;
5702       }
5703       else {
5704        match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
5705           getSB_VAL_CP(UCHARAT(locinput)),
5706           (U8*) reginfo->strbeg,
5707           (U8*) locinput,
5708           (U8*) reginfo->strend,
5709           utf8_target);
5710       }
5711       break;
5712
5713      case WB_BOUND:
5714       if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5715        match = TRUE;
5716       }
5717       else {
5718        match = isWB(WB_UNKNOWN,
5719           getWB_VAL_CP(UCHARAT(locinput -1)),
5720           getWB_VAL_CP(UCHARAT(locinput)),
5721           (U8*) reginfo->strbeg,
5722           (U8*) locinput,
5723           (U8*) reginfo->strend,
5724           utf8_target);
5725       }
5726       break;
5727     }
5728    }
5729
5730    if (to_complement ^ ! match) {
5731     sayNO;
5732    }
5733    break;
5734
5735   case ANYOFL:  /*  /[abc]/l      */
5736    _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5737    /* FALLTHROUGH */
5738   case ANYOF:  /*   /[abc]/       */
5739    if (NEXTCHR_IS_EOS)
5740     sayNO;
5741    if (utf8_target) {
5742     if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
5743                 utf8_target))
5744      sayNO;
5745     locinput += UTF8SKIP(locinput);
5746    }
5747    else {
5748     if (!REGINCLASS(rex, scan, (U8*)locinput))
5749      sayNO;
5750     locinput++;
5751    }
5752    break;
5753
5754   /* The argument (FLAGS) to all the POSIX node types is the class number
5755   * */
5756
5757   case NPOSIXL:   /* \W or [:^punct:] etc. under /l */
5758    to_complement = 1;
5759    /* FALLTHROUGH */
5760
5761   case POSIXL:    /* \w or [:punct:] etc. under /l */
5762    _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5763    if (NEXTCHR_IS_EOS)
5764     sayNO;
5765
5766    /* Use isFOO_lc() for characters within Latin1.  (Note that
5767    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5768    * wouldn't be invariant) */
5769    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
5770     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
5771      sayNO;
5772     }
5773    }
5774    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5775     if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
5776           (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
5777                *(locinput + 1))))))
5778     {
5779      sayNO;
5780     }
5781    }
5782    else { /* Here, must be an above Latin-1 code point */
5783     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5784     goto utf8_posix_above_latin1;
5785    }
5786
5787    /* Here, must be utf8 */
5788    locinput += UTF8SKIP(locinput);
5789    break;
5790
5791   case NPOSIXD:   /* \W or [:^punct:] etc. under /d */
5792    to_complement = 1;
5793    /* FALLTHROUGH */
5794
5795   case POSIXD:    /* \w or [:punct:] etc. under /d */
5796    if (utf8_target) {
5797     goto utf8_posix;
5798    }
5799    goto posixa;
5800
5801   case NPOSIXA:   /* \W or [:^punct:] etc. under /a */
5802
5803    if (NEXTCHR_IS_EOS) {
5804     sayNO;
5805    }
5806
5807    /* All UTF-8 variants match */
5808    if (! UTF8_IS_INVARIANT(nextchr)) {
5809     goto increment_locinput;
5810    }
5811
5812    to_complement = 1;
5813    /* FALLTHROUGH */
5814
5815   case POSIXA:    /* \w or [:punct:] etc. under /a */
5816
5817   posixa:
5818    /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
5819    * UTF-8, and also from NPOSIXA even in UTF-8 when the current
5820    * character is a single byte */
5821
5822    if (NEXTCHR_IS_EOS
5823     || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
5824                FLAGS(scan)))))
5825    {
5826     sayNO;
5827    }
5828
5829    /* Here we are either not in utf8, or we matched a utf8-invariant,
5830    * so the next char is the next byte */
5831    locinput++;
5832    break;
5833
5834   case NPOSIXU:   /* \W or [:^punct:] etc. under /u */
5835    to_complement = 1;
5836    /* FALLTHROUGH */
5837
5838   case POSIXU:    /* \w or [:punct:] etc. under /u */
5839   utf8_posix:
5840    if (NEXTCHR_IS_EOS) {
5841     sayNO;
5842    }
5843
5844    /* Use _generic_isCC() for characters within Latin1.  (Note that
5845    * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5846    * wouldn't be invariant) */
5847    if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
5848     if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
5849               FLAGS(scan)))))
5850     {
5851      sayNO;
5852     }
5853     locinput++;
5854    }
5855    else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5856     if (! (to_complement
5857      ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
5858                *(locinput + 1)),
5859            FLAGS(scan)))))
5860     {
5861      sayNO;
5862     }
5863     locinput += 2;
5864    }
5865    else {  /* Handle above Latin-1 code points */
5866    utf8_posix_above_latin1:
5867     classnum = (_char_class_number) FLAGS(scan);
5868     if (classnum < _FIRST_NON_SWASH_CC) {
5869
5870      /* Here, uses a swash to find such code points.  Load if if
5871      * not done already */
5872      if (! PL_utf8_swash_ptrs[classnum]) {
5873       U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
5874       PL_utf8_swash_ptrs[classnum]
5875         = _core_swash_init("utf8",
5876           "",
5877           &PL_sv_undef, 1, 0,
5878           PL_XPosix_ptrs[classnum], &flags);
5879      }
5880      if (! (to_complement
5881       ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
5882            (U8 *) locinput, TRUE))))
5883      {
5884       sayNO;
5885      }
5886     }
5887     else {  /* Here, uses macros to find above Latin-1 code points */
5888      switch (classnum) {
5889       case _CC_ENUM_SPACE:
5890        if (! (to_complement
5891           ^ cBOOL(is_XPERLSPACE_high(locinput))))
5892        {
5893         sayNO;
5894        }
5895        break;
5896       case _CC_ENUM_BLANK:
5897        if (! (to_complement
5898            ^ cBOOL(is_HORIZWS_high(locinput))))
5899        {
5900         sayNO;
5901        }
5902        break;
5903       case _CC_ENUM_XDIGIT:
5904        if (! (to_complement
5905            ^ cBOOL(is_XDIGIT_high(locinput))))
5906        {
5907         sayNO;
5908        }
5909        break;
5910       case _CC_ENUM_VERTSPACE:
5911        if (! (to_complement
5912            ^ cBOOL(is_VERTWS_high(locinput))))
5913        {
5914         sayNO;
5915        }
5916        break;
5917       default:    /* The rest, e.g. [:cntrl:], can't match
5918          above Latin1 */
5919        if (! to_complement) {
5920         sayNO;
5921        }
5922        break;
5923      }
5924     }
5925     locinput += UTF8SKIP(locinput);
5926    }
5927    break;
5928
5929   case CLUMP: /* Match \X: logical Unicode character.  This is defined as
5930      a Unicode extended Grapheme Cluster */
5931    if (NEXTCHR_IS_EOS)
5932     sayNO;
5933    if  (! utf8_target) {
5934
5935     /* Match either CR LF  or '.', as all the other possibilities
5936     * require utf8 */
5937     locinput++;     /* Match the . or CR */
5938     if (nextchr == '\r' /* And if it was CR, and the next is LF,
5939          match the LF */
5940      && locinput < reginfo->strend
5941      && UCHARAT(locinput) == '\n')
5942     {
5943      locinput++;
5944     }
5945    }
5946    else {
5947
5948     /* Get the gcb type for the current character */
5949     GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
5950              (U8*) reginfo->strend);
5951
5952     /* Then scan through the input until we get to the first
5953     * character whose type is supposed to be a gcb with the
5954     * current character.  (There is always a break at the
5955     * end-of-input) */
5956     locinput += UTF8SKIP(locinput);
5957     while (locinput < reginfo->strend) {
5958      GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
5959               (U8*) reginfo->strend);
5960      if (isGCB(prev_gcb, cur_gcb)) {
5961       break;
5962      }
5963
5964      prev_gcb = cur_gcb;
5965      locinput += UTF8SKIP(locinput);
5966     }
5967
5968
5969    }
5970    break;
5971
5972   case NREFFL:  /*  /\g{name}/il  */
5973   {   /* The capture buffer cases.  The ones beginning with N for the
5974    named buffers just convert to the equivalent numbered and
5975    pretend they were called as the corresponding numbered buffer
5976    op.  */
5977    /* don't initialize these in the declaration, it makes C++
5978    unhappy */
5979    const char *s;
5980    char type;
5981    re_fold_t folder;
5982    const U8 *fold_array;
5983    UV utf8_fold_flags;
5984
5985    _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5986    folder = foldEQ_locale;
5987    fold_array = PL_fold_locale;
5988    type = REFFL;
5989    utf8_fold_flags = FOLDEQ_LOCALE;
5990    goto do_nref;
5991
5992   case NREFFA:  /*  /\g{name}/iaa  */
5993    folder = foldEQ_latin1;
5994    fold_array = PL_fold_latin1;
5995    type = REFFA;
5996    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5997    goto do_nref;
5998
5999   case NREFFU:  /*  /\g{name}/iu  */
6000    folder = foldEQ_latin1;
6001    fold_array = PL_fold_latin1;
6002    type = REFFU;
6003    utf8_fold_flags = 0;
6004    goto do_nref;
6005
6006   case NREFF:  /*  /\g{name}/i  */
6007    folder = foldEQ;
6008    fold_array = PL_fold;
6009    type = REFF;
6010    utf8_fold_flags = 0;
6011    goto do_nref;
6012
6013   case NREF:  /*  /\g{name}/   */
6014    type = REF;
6015    folder = NULL;
6016    fold_array = NULL;
6017    utf8_fold_flags = 0;
6018   do_nref:
6019
6020    /* For the named back references, find the corresponding buffer
6021    * number */
6022    n = reg_check_named_buff_matched(rex,scan);
6023
6024    if ( ! n ) {
6025     sayNO;
6026    }
6027    goto do_nref_ref_common;
6028
6029   case REFFL:  /*  /\1/il  */
6030    _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6031    folder = foldEQ_locale;
6032    fold_array = PL_fold_locale;
6033    utf8_fold_flags = FOLDEQ_LOCALE;
6034    goto do_ref;
6035
6036   case REFFA:  /*  /\1/iaa  */
6037    folder = foldEQ_latin1;
6038    fold_array = PL_fold_latin1;
6039    utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6040    goto do_ref;
6041
6042   case REFFU:  /*  /\1/iu  */
6043    folder = foldEQ_latin1;
6044    fold_array = PL_fold_latin1;
6045    utf8_fold_flags = 0;
6046    goto do_ref;
6047
6048   case REFF:  /*  /\1/i  */
6049    folder = foldEQ;
6050    fold_array = PL_fold;
6051    utf8_fold_flags = 0;
6052    goto do_ref;
6053
6054   case REF:  /*  /\1/    */
6055    folder = NULL;
6056    fold_array = NULL;
6057    utf8_fold_flags = 0;
6058
6059   do_ref:
6060    type = OP(scan);
6061    n = ARG(scan);  /* which paren pair */
6062
6063   do_nref_ref_common:
6064    ln = rex->offs[n].start;
6065    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6066    if (rex->lastparen < n || ln == -1)
6067     sayNO;   /* Do not match unless seen CLOSEn. */
6068    if (ln == rex->offs[n].end)
6069     break;
6070
6071    s = reginfo->strbeg + ln;
6072    if (type != REF /* REF can do byte comparison */
6073     && (utf8_target || type == REFFU || type == REFFL))
6074    {
6075     char * limit = reginfo->strend;
6076
6077     /* This call case insensitively compares the entire buffer
6078      * at s, with the current input starting at locinput, but
6079      * not going off the end given by reginfo->strend, and
6080      * returns in <limit> upon success, how much of the
6081      * current input was matched */
6082     if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
6083          locinput, &limit, 0, utf8_target, utf8_fold_flags))
6084     {
6085      sayNO;
6086     }
6087     locinput = limit;
6088     break;
6089    }
6090
6091    /* Not utf8:  Inline the first character, for speed. */
6092    if (!NEXTCHR_IS_EOS &&
6093     UCHARAT(s) != nextchr &&
6094     (type == REF ||
6095     UCHARAT(s) != fold_array[nextchr]))
6096     sayNO;
6097    ln = rex->offs[n].end - ln;
6098    if (locinput + ln > reginfo->strend)
6099     sayNO;
6100    if (ln > 1 && (type == REF
6101       ? memNE(s, locinput, ln)
6102       : ! folder(s, locinput, ln)))
6103     sayNO;
6104    locinput += ln;
6105    break;
6106   }
6107
6108   case NOTHING: /* null op; e.g. the 'nothing' following
6109      * the '*' in m{(a+|b)*}' */
6110    break;
6111   case TAIL: /* placeholder while compiling (A|B|C) */
6112    break;
6113
6114 #undef  ST
6115 #define ST st->u.eval
6116   {
6117    SV *ret;
6118    REGEXP *re_sv;
6119    regexp *re;
6120    regexp_internal *rei;
6121    regnode *startpoint;
6122
6123   case GOSTART: /*  (?R)  */
6124   case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
6125    if (cur_eval && cur_eval->locinput==locinput) {
6126     if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
6127      Perl_croak(aTHX_ "Infinite recursion in regex");
6128     if ( ++nochange_depth > max_nochange_depth )
6129      Perl_croak(aTHX_
6130       "Pattern subroutine nesting without pos change"
6131       " exceeded limit in regex");
6132    } else {
6133     nochange_depth = 0;
6134    }
6135    re_sv = rex_sv;
6136    re = rex;
6137    rei = rexi;
6138    if (OP(scan)==GOSUB) {
6139     startpoint = scan + ARG2L(scan);
6140     ST.close_paren = ARG(scan);
6141    } else {
6142     startpoint = rei->program+1;
6143     ST.close_paren = 0;
6144    }
6145
6146    /* Save all the positions seen so far. */
6147    ST.cp = regcppush(rex, 0, maxopenparen);
6148    REGCP_SET(ST.lastcp);
6149
6150    /* and then jump to the code we share with EVAL */
6151    goto eval_recurse_doit;
6152    /* NOTREACHED */
6153
6154   case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
6155    if (cur_eval && cur_eval->locinput==locinput) {
6156     if ( ++nochange_depth > max_nochange_depth )
6157      Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6158    } else {
6159     nochange_depth = 0;
6160    }
6161    {
6162     /* execute the code in the {...} */
6163
6164     dSP;
6165     IV before;
6166     OP * const oop = PL_op;
6167     COP * const ocurcop = PL_curcop;
6168     OP *nop;
6169     CV *newcv;
6170
6171     /* save *all* paren positions */
6172     regcppush(rex, 0, maxopenparen);
6173     REGCP_SET(runops_cp);
6174
6175     if (!caller_cv)
6176      caller_cv = find_runcv(NULL);
6177
6178     n = ARG(scan);
6179
6180     if (rexi->data->what[n] == 'r') { /* code from an external qr */
6181      newcv = (ReANY(
6182             (REGEXP*)(rexi->data->data[n])
6183            ))->qr_anoncv
6184           ;
6185      nop = (OP*)rexi->data->data[n+1];
6186     }
6187     else if (rexi->data->what[n] == 'l') { /* literal code */
6188      newcv = caller_cv;
6189      nop = (OP*)rexi->data->data[n];
6190      assert(CvDEPTH(newcv));
6191     }
6192     else {
6193      /* literal with own CV */
6194      assert(rexi->data->what[n] == 'L');
6195      newcv = rex->qr_anoncv;
6196      nop = (OP*)rexi->data->data[n];
6197     }
6198
6199     /* normally if we're about to execute code from the same
6200     * CV that we used previously, we just use the existing
6201     * CX stack entry. However, its possible that in the
6202     * meantime we may have backtracked, popped from the save
6203     * stack, and undone the SAVECOMPPAD(s) associated with
6204     * PUSH_MULTICALL; in which case PL_comppad no longer
6205     * points to newcv's pad. */
6206     if (newcv != last_pushed_cv || PL_comppad != last_pad)
6207     {
6208      U8 flags = (CXp_SUB_RE |
6209         ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6210      if (last_pushed_cv) {
6211       CHANGE_MULTICALL_FLAGS(newcv, flags);
6212      }
6213      else {
6214       PUSH_MULTICALL_FLAGS(newcv, flags);
6215      }
6216      last_pushed_cv = newcv;
6217     }
6218     else {
6219      /* these assignments are just to silence compiler
6220      * warnings */
6221      multicall_cop = NULL;
6222      newsp = NULL;
6223     }
6224     last_pad = PL_comppad;
6225
6226     /* the initial nextstate you would normally execute
6227     * at the start of an eval (which would cause error
6228     * messages to come from the eval), may be optimised
6229     * away from the execution path in the regex code blocks;
6230     * so manually set PL_curcop to it initially */
6231     {
6232      OP *o = cUNOPx(nop)->op_first;
6233      assert(o->op_type == OP_NULL);
6234      if (o->op_targ == OP_SCOPE) {
6235       o = cUNOPo->op_first;
6236      }
6237      else {
6238       assert(o->op_targ == OP_LEAVE);
6239       o = cUNOPo->op_first;
6240       assert(o->op_type == OP_ENTER);
6241       o = OpSIBLING(o);
6242      }
6243
6244      if (o->op_type != OP_STUB) {
6245       assert(    o->op_type == OP_NEXTSTATE
6246         || o->op_type == OP_DBSTATE
6247         || (o->op_type == OP_NULL
6248          &&  (  o->op_targ == OP_NEXTSTATE
6249           || o->op_targ == OP_DBSTATE
6250           )
6251          )
6252       );
6253       PL_curcop = (COP*)o;
6254      }
6255     }
6256     nop = nop->op_next;
6257
6258     DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
6259      "  re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
6260
6261     rex->offs[0].end = locinput - reginfo->strbeg;
6262     if (reginfo->info_aux_eval->pos_magic)
6263      MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6264         reginfo->sv, reginfo->strbeg,
6265         locinput - reginfo->strbeg);
6266
6267     if (sv_yes_mark) {
6268      SV *sv_mrk = get_sv("REGMARK", 1);
6269      sv_setsv(sv_mrk, sv_yes_mark);
6270     }
6271
6272     /* we don't use MULTICALL here as we want to call the
6273     * first op of the block of interest, rather than the
6274     * first op of the sub */
6275     before = (IV)(SP-PL_stack_base);
6276     PL_op = nop;
6277     CALLRUNOPS(aTHX);   /* Scalar context. */
6278     SPAGAIN;
6279     if ((IV)(SP-PL_stack_base) == before)
6280      ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
6281     else {
6282      ret = POPs;
6283      PUTBACK;
6284     }
6285
6286     /* before restoring everything, evaluate the returned
6287     * value, so that 'uninit' warnings don't use the wrong
6288     * PL_op or pad. Also need to process any magic vars
6289     * (e.g. $1) *before* parentheses are restored */
6290
6291     PL_op = NULL;
6292
6293     re_sv = NULL;
6294     if (logical == 0)        /*   (?{})/   */
6295      sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6296     else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
6297      sw = cBOOL(SvTRUE(ret));
6298      logical = 0;
6299     }
6300     else {                   /*  /(??{})  */
6301      /*  if its overloaded, let the regex compiler handle
6302      *  it; otherwise extract regex, or stringify  */
6303      if (SvGMAGICAL(ret))
6304       ret = sv_mortalcopy(ret);
6305      if (!SvAMAGIC(ret)) {
6306       SV *sv = ret;
6307       if (SvROK(sv))
6308        sv = SvRV(sv);
6309       if (SvTYPE(sv) == SVt_REGEXP)
6310        re_sv = (REGEXP*) sv;
6311       else if (SvSMAGICAL(ret)) {
6312        MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
6313        if (mg)
6314         re_sv = (REGEXP *) mg->mg_obj;
6315       }
6316
6317       /* force any undef warnings here */
6318       if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
6319        ret = sv_mortalcopy(ret);
6320        (void) SvPV_force_nolen(ret);
6321       }
6322      }
6323
6324     }
6325
6326     /* *** Note that at this point we don't restore
6327     * PL_comppad, (or pop the CxSUB) on the assumption it may
6328     * be used again soon. This is safe as long as nothing
6329     * in the regexp code uses the pad ! */
6330     PL_op = oop;
6331     PL_curcop = ocurcop;
6332     S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
6333     PL_curpm = PL_reg_curpm;
6334
6335     if (logical != 2)
6336      break;
6337    }
6338
6339     /* only /(??{})/  from now on */
6340     logical = 0;
6341     {
6342      /* extract RE object from returned value; compiling if
6343      * necessary */
6344
6345      if (re_sv) {
6346       re_sv = reg_temp_copy(NULL, re_sv);
6347      }
6348      else {
6349       U32 pm_flags = 0;
6350
6351       if (SvUTF8(ret) && IN_BYTES) {
6352        /* In use 'bytes': make a copy of the octet
6353        * sequence, but without the flag on */
6354        STRLEN len;
6355        const char *const p = SvPV(ret, len);
6356        ret = newSVpvn_flags(p, len, SVs_TEMP);
6357       }
6358       if (rex->intflags & PREGf_USE_RE_EVAL)
6359        pm_flags |= PMf_USE_RE_EVAL;
6360
6361       /* if we got here, it should be an engine which
6362       * supports compiling code blocks and stuff */
6363       assert(rex->engine && rex->engine->op_comp);
6364       assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
6365       re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
6366          rex->engine, NULL, NULL,
6367          /* copy /msixn etc to inner pattern */
6368          ARG2L(scan),
6369          pm_flags);
6370
6371       if (!(SvFLAGS(ret)
6372        & (SVs_TEMP | SVs_GMG | SVf_ROK))
6373       && (!SvPADTMP(ret) || SvREADONLY(ret))) {
6374        /* This isn't a first class regexp. Instead, it's
6375        caching a regexp onto an existing, Perl visible
6376        scalar.  */
6377        sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
6378       }
6379      }
6380      SAVEFREESV(re_sv);
6381      re = ReANY(re_sv);
6382     }
6383     RXp_MATCH_COPIED_off(re);
6384     re->subbeg = rex->subbeg;
6385     re->sublen = rex->sublen;
6386     re->suboffset = rex->suboffset;
6387     re->subcoffset = rex->subcoffset;
6388     re->lastparen = 0;
6389     re->lastcloseparen = 0;
6390     rei = RXi_GET(re);
6391     DEBUG_EXECUTE_r(
6392      debug_start_match(re_sv, utf8_target, locinput,
6393          reginfo->strend, "Matching embedded");
6394     );
6395     startpoint = rei->program + 1;
6396      ST.close_paren = 0; /* only used for GOSUB */
6397     /* Save all the seen positions so far. */
6398     ST.cp = regcppush(rex, 0, maxopenparen);
6399     REGCP_SET(ST.lastcp);
6400     /* and set maxopenparen to 0, since we are starting a "fresh" match */
6401     maxopenparen = 0;
6402     /* run the pattern returned from (??{...}) */
6403
6404    eval_recurse_doit: /* Share code with GOSUB below this line
6405        * At this point we expect the stack context to be
6406        * set up correctly */
6407
6408     /* invalidate the S-L poscache. We're now executing a
6409     * different set of WHILEM ops (and their associated
6410     * indexes) against the same string, so the bits in the
6411     * cache are meaningless. Setting maxiter to zero forces
6412     * the cache to be invalidated and zeroed before reuse.
6413     * XXX This is too dramatic a measure. Ideally we should
6414     * save the old cache and restore when running the outer
6415     * pattern again */
6416     reginfo->poscache_maxiter = 0;
6417
6418     /* the new regexp might have a different is_utf8_pat than we do */
6419     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
6420
6421     ST.prev_rex = rex_sv;
6422     ST.prev_curlyx = cur_curlyx;
6423     rex_sv = re_sv;
6424     SET_reg_curpm(rex_sv);
6425     rex = re;
6426     rexi = rei;
6427     cur_curlyx = NULL;
6428     ST.B = next;
6429     ST.prev_eval = cur_eval;
6430     cur_eval = st;
6431     /* now continue from first node in postoned RE */
6432     PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
6433     /* NOTREACHED */
6434     NOT_REACHED; /* NOTREACHED */
6435   }
6436
6437   case EVAL_AB: /* cleanup after a successful (??{A})B */
6438    /* note: this is called twice; first after popping B, then A */
6439    rex_sv = ST.prev_rex;
6440    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6441    SET_reg_curpm(rex_sv);
6442    rex = ReANY(rex_sv);
6443    rexi = RXi_GET(rex);
6444    {
6445     /* preserve $^R across LEAVE's. See Bug 121070. */
6446     SV *save_sv= GvSV(PL_replgv);
6447     SvREFCNT_inc(save_sv);
6448     regcpblow(ST.cp); /* LEAVE in disguise */
6449     sv_setsv(GvSV(PL_replgv), save_sv);
6450     SvREFCNT_dec(save_sv);
6451    }
6452    cur_eval = ST.prev_eval;
6453    cur_curlyx = ST.prev_curlyx;
6454
6455    /* Invalidate cache. See "invalidate" comment above. */
6456    reginfo->poscache_maxiter = 0;
6457    if ( nochange_depth )
6458     nochange_depth--;
6459    sayYES;
6460
6461
6462   case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
6463    /* note: this is called twice; first after popping B, then A */
6464    rex_sv = ST.prev_rex;
6465    is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6466    SET_reg_curpm(rex_sv);
6467    rex = ReANY(rex_sv);
6468    rexi = RXi_GET(rex);
6469
6470    REGCP_UNWIND(ST.lastcp);
6471    regcppop(rex, &maxopenparen);
6472    cur_eval = ST.prev_eval;
6473    cur_curlyx = ST.prev_curlyx;
6474    /* Invalidate cache. See "invalidate" comment above. */
6475    reginfo->poscache_maxiter = 0;
6476    if ( nochange_depth )
6477     nochange_depth--;
6478    sayNO_SILENT;
6479 #undef ST
6480
6481   case OPEN: /*  (  */
6482    n = ARG(scan);  /* which paren pair */
6483    rex->offs[n].start_tmp = locinput - reginfo->strbeg;
6484    if (n > maxopenparen)
6485     maxopenparen = n;
6486    DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
6487     "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
6488     PTR2UV(rex),
6489     PTR2UV(rex->offs),
6490     (UV)n,
6491     (IV)rex->offs[n].start_tmp,
6492     (UV)maxopenparen
6493    ));
6494    lastopen = n;
6495    break;
6496
6497 /* XXX really need to log other places start/end are set too */
6498 #define CLOSE_CAPTURE \
6499  rex->offs[n].start = rex->offs[n].start_tmp; \
6500  rex->offs[n].end = locinput - reginfo->strbeg; \
6501  DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
6502   "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
6503   PTR2UV(rex), \
6504   PTR2UV(rex->offs), \
6505   (UV)n, \
6506   (IV)rex->offs[n].start, \
6507   (IV)rex->offs[n].end \
6508  ))
6509
6510   case CLOSE:  /*  )  */
6511    n = ARG(scan);  /* which paren pair */
6512    CLOSE_CAPTURE;
6513    if (n > rex->lastparen)
6514     rex->lastparen = n;
6515    rex->lastcloseparen = n;
6516    if (cur_eval && cur_eval->u.eval.close_paren == n) {
6517     goto fake_end;
6518    }
6519    break;
6520
6521   case ACCEPT:  /*  (*ACCEPT)  */
6522    if (ARG(scan)){
6523     regnode *cursor;
6524     for (cursor=scan;
6525      cursor && OP(cursor)!=END;
6526      cursor=regnext(cursor))
6527     {
6528      if ( OP(cursor)==CLOSE ){
6529       n = ARG(cursor);
6530       if ( n <= lastopen ) {
6531        CLOSE_CAPTURE;
6532        if (n > rex->lastparen)
6533         rex->lastparen = n;
6534        rex->lastcloseparen = n;
6535        if ( n == ARG(scan) || (cur_eval &&
6536         cur_eval->u.eval.close_paren == n))
6537         break;
6538       }
6539      }
6540     }
6541    }
6542    goto fake_end;
6543    /* NOTREACHED */
6544
6545   case GROUPP:  /*  (?(1))  */
6546    n = ARG(scan);  /* which paren pair */
6547    sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
6548    break;
6549
6550   case NGROUPP:  /*  (?(<name>))  */
6551    /* reg_check_named_buff_matched returns 0 for no match */
6552    sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
6553    break;
6554
6555   case INSUBP:   /*  (?(R))  */
6556    n = ARG(scan);
6557    sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
6558    break;
6559
6560   case DEFINEP:  /*  (?(DEFINE))  */
6561    sw = 0;
6562    break;
6563
6564   case IFTHEN:   /*  (?(cond)A|B)  */
6565    reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6566    if (sw)
6567     next = NEXTOPER(NEXTOPER(scan));
6568    else {
6569     next = scan + ARG(scan);
6570     if (OP(next) == IFTHEN) /* Fake one. */
6571      next = NEXTOPER(NEXTOPER(next));
6572    }
6573    break;
6574
6575   case LOGICAL:  /* modifier for EVAL and IFMATCH */
6576    logical = scan->flags;
6577    break;
6578
6579 /*******************************************************************
6580
6581 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
6582 pattern, where A and B are subpatterns. (For simple A, CURLYM or
6583 STAR/PLUS/CURLY/CURLYN are used instead.)
6584
6585 A*B is compiled as <CURLYX><A><WHILEM><B>
6586
6587 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
6588 state, which contains the current count, initialised to -1. It also sets
6589 cur_curlyx to point to this state, with any previous value saved in the
6590 state block.
6591
6592 CURLYX then jumps straight to the WHILEM op, rather than executing A,
6593 since the pattern may possibly match zero times (i.e. it's a while {} loop
6594 rather than a do {} while loop).
6595
6596 Each entry to WHILEM represents a successful match of A. The count in the
6597 CURLYX block is incremented, another WHILEM state is pushed, and execution
6598 passes to A or B depending on greediness and the current count.
6599
6600 For example, if matching against the string a1a2a3b (where the aN are
6601 substrings that match /A/), then the match progresses as follows: (the
6602 pushed states are interspersed with the bits of strings matched so far):
6603
6604  <CURLYX cnt=-1>
6605  <CURLYX cnt=0><WHILEM>
6606  <CURLYX cnt=1><WHILEM> a1 <WHILEM>
6607  <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
6608  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
6609  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
6610
6611 (Contrast this with something like CURLYM, which maintains only a single
6612 backtrack state:
6613
6614  <CURLYM cnt=0> a1
6615  a1 <CURLYM cnt=1> a2
6616  a1 a2 <CURLYM cnt=2> a3
6617  a1 a2 a3 <CURLYM cnt=3> b
6618 )
6619
6620 Each WHILEM state block marks a point to backtrack to upon partial failure
6621 of A or B, and also contains some minor state data related to that
6622 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
6623 overall state, such as the count, and pointers to the A and B ops.
6624
6625 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
6626 must always point to the *current* CURLYX block, the rules are:
6627
6628 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
6629 and set cur_curlyx to point the new block.
6630
6631 When popping the CURLYX block after a successful or unsuccessful match,
6632 restore the previous cur_curlyx.
6633
6634 When WHILEM is about to execute B, save the current cur_curlyx, and set it
6635 to the outer one saved in the CURLYX block.
6636
6637 When popping the WHILEM block after a successful or unsuccessful B match,
6638 restore the previous cur_curlyx.
6639
6640 Here's an example for the pattern (AI* BI)*BO
6641 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
6642
6643 cur_
6644 curlyx backtrack stack
6645 ------ ---------------
6646 NULL
6647 CO     <CO prev=NULL> <WO>
6648 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6649 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6650 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
6651
6652 At this point the pattern succeeds, and we work back down the stack to
6653 clean up, restoring as we go:
6654
6655 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6656 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6657 CO     <CO prev=NULL> <WO>
6658 NULL
6659
6660 *******************************************************************/
6661
6662 #define ST st->u.curlyx
6663
6664   case CURLYX:    /* start of /A*B/  (for complex A) */
6665   {
6666    /* No need to save/restore up to this paren */
6667    I32 parenfloor = scan->flags;
6668
6669    assert(next); /* keep Coverity happy */
6670    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
6671     next += ARG(next);
6672
6673    /* XXXX Probably it is better to teach regpush to support
6674    parenfloor > maxopenparen ... */
6675    if (parenfloor > (I32)rex->lastparen)
6676     parenfloor = rex->lastparen; /* Pessimization... */
6677
6678    ST.prev_curlyx= cur_curlyx;
6679    cur_curlyx = st;
6680    ST.cp = PL_savestack_ix;
6681
6682    /* these fields contain the state of the current curly.
6683    * they are accessed by subsequent WHILEMs */
6684    ST.parenfloor = parenfloor;
6685    ST.me = scan;
6686    ST.B = next;
6687    ST.minmod = minmod;
6688    minmod = 0;
6689    ST.count = -1; /* this will be updated by WHILEM */
6690    ST.lastloc = NULL;  /* this will be updated by WHILEM */
6691
6692    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
6693    /* NOTREACHED */
6694    NOT_REACHED; /* NOTREACHED */
6695   }
6696
6697   case CURLYX_end: /* just finished matching all of A*B */
6698    cur_curlyx = ST.prev_curlyx;
6699    sayYES;
6700    /* NOTREACHED */
6701    NOT_REACHED; /* NOTREACHED */
6702
6703   case CURLYX_end_fail: /* just failed to match all of A*B */
6704    regcpblow(ST.cp);
6705    cur_curlyx = ST.prev_curlyx;
6706    sayNO;
6707    /* NOTREACHED */
6708    NOT_REACHED; /* NOTREACHED */
6709
6710
6711 #undef ST
6712 #define ST st->u.whilem
6713
6714   case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
6715   {
6716    /* see the discussion above about CURLYX/WHILEM */
6717    I32 n;
6718    int min, max;
6719    regnode *A;
6720
6721    assert(cur_curlyx); /* keep Coverity happy */
6722
6723    min = ARG1(cur_curlyx->u.curlyx.me);
6724    max = ARG2(cur_curlyx->u.curlyx.me);
6725    A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
6726    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
6727    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
6728    ST.cache_offset = 0;
6729    ST.cache_mask = 0;
6730
6731
6732    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6733     "%*s  whilem: matched %ld out of %d..%d\n",
6734     REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
6735    );
6736
6737    /* First just match a string of min A's. */
6738
6739    if (n < min) {
6740     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6741          maxopenparen);
6742     cur_curlyx->u.curlyx.lastloc = locinput;
6743     REGCP_SET(ST.lastcp);
6744
6745     PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
6746     /* NOTREACHED */
6747     NOT_REACHED; /* NOTREACHED */
6748    }
6749
6750    /* If degenerate A matches "", assume A done. */
6751
6752    if (locinput == cur_curlyx->u.curlyx.lastloc) {
6753     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6754     "%*s  whilem: empty match detected, trying continuation...\n",
6755     REPORT_CODE_OFF+depth*2, "")
6756     );
6757     goto do_whilem_B_max;
6758    }
6759
6760    /* super-linear cache processing.
6761    *
6762    * The idea here is that for certain types of CURLYX/WHILEM -
6763    * principally those whose upper bound is infinity (and
6764    * excluding regexes that have things like \1 and other very
6765    * non-regular expresssiony things), then if a pattern like
6766    * /....A*.../ fails and we backtrack to the WHILEM, then we
6767    * make a note that this particular WHILEM op was at string
6768    * position 47 (say) when the rest of pattern failed. Then, if
6769    * we ever find ourselves back at that WHILEM, and at string
6770    * position 47 again, we can just fail immediately rather than
6771    * running the rest of the pattern again.
6772    *
6773    * This is very handy when patterns start to go
6774    * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
6775    * with a combinatorial explosion of backtracking.
6776    *
6777    * The cache is implemented as a bit array, with one bit per
6778    * string byte position per WHILEM op (up to 16) - so its
6779    * between 0.25 and 2x the string size.
6780    *
6781    * To avoid allocating a poscache buffer every time, we do an
6782    * initially countdown; only after we have  executed a WHILEM
6783    * op (string-length x #WHILEMs) times do we allocate the
6784    * cache.
6785    *
6786    * The top 4 bits of scan->flags byte say how many different
6787    * relevant CURLLYX/WHILEM op pairs there are, while the
6788    * bottom 4-bits is the identifying index number of this
6789    * WHILEM.
6790    */
6791
6792    if (scan->flags) {
6793
6794     if (!reginfo->poscache_maxiter) {
6795      /* start the countdown: Postpone detection until we
6796      * know the match is not *that* much linear. */
6797      reginfo->poscache_maxiter
6798       =    (reginfo->strend - reginfo->strbeg + 1)
6799       * (scan->flags>>4);
6800      /* possible overflow for long strings and many CURLYX's */
6801      if (reginfo->poscache_maxiter < 0)
6802       reginfo->poscache_maxiter = I32_MAX;
6803      reginfo->poscache_iter = reginfo->poscache_maxiter;
6804     }
6805
6806     if (reginfo->poscache_iter-- == 0) {
6807      /* initialise cache */
6808      const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
6809      regmatch_info_aux *const aux = reginfo->info_aux;
6810      if (aux->poscache) {
6811       if ((SSize_t)reginfo->poscache_size < size) {
6812        Renew(aux->poscache, size, char);
6813        reginfo->poscache_size = size;
6814       }
6815       Zero(aux->poscache, size, char);
6816      }
6817      else {
6818       reginfo->poscache_size = size;
6819       Newxz(aux->poscache, size, char);
6820      }
6821      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6822  "%swhilem: Detected a super-linear match, switching on caching%s...\n",
6823        PL_colors[4], PL_colors[5])
6824      );
6825     }
6826
6827     if (reginfo->poscache_iter < 0) {
6828      /* have we already failed at this position? */
6829      SSize_t offset, mask;
6830
6831      reginfo->poscache_iter = -1; /* stop eventual underflow */
6832      offset  = (scan->flags & 0xf) - 1
6833         +   (locinput - reginfo->strbeg)
6834         * (scan->flags>>4);
6835      mask    = 1 << (offset % 8);
6836      offset /= 8;
6837      if (reginfo->info_aux->poscache[offset] & mask) {
6838       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6839        "%*s  whilem: (cache) already tried at this position...\n",
6840        REPORT_CODE_OFF+depth*2, "")
6841       );
6842       sayNO; /* cache records failure */
6843      }
6844      ST.cache_offset = offset;
6845      ST.cache_mask   = mask;
6846     }
6847    }
6848
6849    /* Prefer B over A for minimal matching. */
6850
6851    if (cur_curlyx->u.curlyx.minmod) {
6852     ST.save_curlyx = cur_curlyx;
6853     cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6854     ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
6855        maxopenparen);
6856     REGCP_SET(ST.lastcp);
6857     PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
6858          locinput);
6859     /* NOTREACHED */
6860     NOT_REACHED; /* NOTREACHED */
6861    }
6862
6863    /* Prefer A over B for maximal matching. */
6864
6865    if (n < max) { /* More greed allowed? */
6866     ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6867        maxopenparen);
6868     cur_curlyx->u.curlyx.lastloc = locinput;
6869     REGCP_SET(ST.lastcp);
6870     PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
6871     /* NOTREACHED */
6872     NOT_REACHED; /* NOTREACHED */
6873    }
6874    goto do_whilem_B_max;
6875   }
6876   /* NOTREACHED */
6877   NOT_REACHED; /* NOTREACHED */
6878
6879   case WHILEM_B_min: /* just matched B in a minimal match */
6880   case WHILEM_B_max: /* just matched B in a maximal match */
6881    cur_curlyx = ST.save_curlyx;
6882    sayYES;
6883    /* NOTREACHED */
6884    NOT_REACHED; /* NOTREACHED */
6885
6886   case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
6887    cur_curlyx = ST.save_curlyx;
6888    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6889    cur_curlyx->u.curlyx.count--;
6890    CACHEsayNO;
6891    /* NOTREACHED */
6892    NOT_REACHED; /* NOTREACHED */
6893
6894   case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
6895    /* FALLTHROUGH */
6896   case WHILEM_A_pre_fail: /* just failed to match even minimal A */
6897    REGCP_UNWIND(ST.lastcp);
6898    regcppop(rex, &maxopenparen);
6899    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6900    cur_curlyx->u.curlyx.count--;
6901    CACHEsayNO;
6902    /* NOTREACHED */
6903    NOT_REACHED; /* NOTREACHED */
6904
6905   case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
6906    REGCP_UNWIND(ST.lastcp);
6907    regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
6908    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6909     "%*s  whilem: failed, trying continuation...\n",
6910     REPORT_CODE_OFF+depth*2, "")
6911    );
6912   do_whilem_B_max:
6913    if (cur_curlyx->u.curlyx.count >= REG_INFTY
6914     && ckWARN(WARN_REGEXP)
6915     && !reginfo->warned)
6916    {
6917     reginfo->warned = TRUE;
6918     Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6919      "Complex regular subexpression recursion limit (%d) "
6920      "exceeded",
6921      REG_INFTY - 1);
6922    }
6923
6924    /* now try B */
6925    ST.save_curlyx = cur_curlyx;
6926    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6927    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
6928         locinput);
6929    /* NOTREACHED */
6930    NOT_REACHED; /* NOTREACHED */
6931
6932   case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
6933    cur_curlyx = ST.save_curlyx;
6934    REGCP_UNWIND(ST.lastcp);
6935    regcppop(rex, &maxopenparen);
6936
6937    if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
6938     /* Maximum greed exceeded */
6939     if (cur_curlyx->u.curlyx.count >= REG_INFTY
6940      && ckWARN(WARN_REGEXP)
6941      && !reginfo->warned)
6942     {
6943      reginfo->warned = TRUE;
6944      Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6945       "Complex regular subexpression recursion "
6946       "limit (%d) exceeded",
6947       REG_INFTY - 1);
6948     }
6949     cur_curlyx->u.curlyx.count--;
6950     CACHEsayNO;
6951    }
6952
6953    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6954     "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6955    );
6956    /* Try grabbing another A and see if it helps. */
6957    cur_curlyx->u.curlyx.lastloc = locinput;
6958    ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6959        maxopenparen);
6960    REGCP_SET(ST.lastcp);
6961    PUSH_STATE_GOTO(WHILEM_A_min,
6962     /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6963     locinput);
6964    /* NOTREACHED */
6965    NOT_REACHED; /* NOTREACHED */
6966
6967 #undef  ST
6968 #define ST st->u.branch
6969
6970   case BRANCHJ:     /*  /(...|A|...)/ with long next pointer */
6971    next = scan + ARG(scan);
6972    if (next == scan)
6973     next = NULL;
6974    scan = NEXTOPER(scan);
6975    /* FALLTHROUGH */
6976
6977   case BRANCH:     /*  /(...|A|...)/ */
6978    scan = NEXTOPER(scan); /* scan now points to inner node */
6979    ST.lastparen = rex->lastparen;
6980    ST.lastcloseparen = rex->lastcloseparen;
6981    ST.next_branch = next;
6982    REGCP_SET(ST.cp);
6983
6984    /* Now go into the branch */
6985    if (has_cutgroup) {
6986     PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
6987    } else {
6988     PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
6989    }
6990    /* NOTREACHED */
6991    NOT_REACHED; /* NOTREACHED */
6992
6993   case CUTGROUP:  /*  /(*THEN)/  */
6994    sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
6995     MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6996    PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
6997    /* NOTREACHED */
6998    NOT_REACHED; /* NOTREACHED */
6999
7000   case CUTGROUP_next_fail:
7001    do_cutgroup = 1;
7002    no_final = 1;
7003    if (st->u.mark.mark_name)
7004     sv_commit = st->u.mark.mark_name;
7005    sayNO;
7006    /* NOTREACHED */
7007    NOT_REACHED; /* NOTREACHED */
7008
7009   case BRANCH_next:
7010    sayYES;
7011    /* NOTREACHED */
7012    NOT_REACHED; /* NOTREACHED */
7013
7014   case BRANCH_next_fail: /* that branch failed; try the next, if any */
7015    if (do_cutgroup) {
7016     do_cutgroup = 0;
7017     no_final = 0;
7018    }
7019    REGCP_UNWIND(ST.cp);
7020    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7021    scan = ST.next_branch;
7022    /* no more branches? */
7023    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7024     DEBUG_EXECUTE_r({
7025      PerlIO_printf( Perl_debug_log,
7026       "%*s  %sBRANCH failed...%s\n",
7027       REPORT_CODE_OFF+depth*2, "",
7028       PL_colors[4],
7029       PL_colors[5] );
7030     });
7031     sayNO_SILENT;
7032    }
7033    continue; /* execute next BRANCH[J] op */
7034    /* NOTREACHED */
7035
7036   case MINMOD: /* next op will be non-greedy, e.g. A*?  */
7037    minmod = 1;
7038    break;
7039
7040 #undef  ST
7041 #define ST st->u.curlym
7042
7043   case CURLYM: /* /A{m,n}B/ where A is fixed-length */
7044
7045    /* This is an optimisation of CURLYX that enables us to push
7046    * only a single backtracking state, no matter how many matches
7047    * there are in {m,n}. It relies on the pattern being constant
7048    * length, with no parens to influence future backrefs
7049    */
7050
7051    ST.me = scan;
7052    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7053
7054    ST.lastparen      = rex->lastparen;
7055    ST.lastcloseparen = rex->lastcloseparen;
7056
7057    /* if paren positive, emulate an OPEN/CLOSE around A */
7058    if (ST.me->flags) {
7059     U32 paren = ST.me->flags;
7060     if (paren > maxopenparen)
7061      maxopenparen = paren;
7062     scan += NEXT_OFF(scan); /* Skip former OPEN. */
7063    }
7064    ST.A = scan;
7065    ST.B = next;
7066    ST.alen = 0;
7067    ST.count = 0;
7068    ST.minmod = minmod;
7069    minmod = 0;
7070    ST.c1 = CHRTEST_UNINIT;
7071    REGCP_SET(ST.cp);
7072
7073    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7074     goto curlym_do_B;
7075
7076   curlym_do_A: /* execute the A in /A{m,n}B/  */
7077    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7078    /* NOTREACHED */
7079    NOT_REACHED; /* NOTREACHED */
7080
7081   case CURLYM_A: /* we've just matched an A */
7082    ST.count++;
7083    /* after first match, determine A's length: u.curlym.alen */
7084    if (ST.count == 1) {
7085     if (reginfo->is_utf8_target) {
7086      char *s = st->locinput;
7087      while (s < locinput) {
7088       ST.alen++;
7089       s += UTF8SKIP(s);
7090      }
7091     }
7092     else {
7093      ST.alen = locinput - st->locinput;
7094     }
7095     if (ST.alen == 0)
7096      ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7097    }
7098    DEBUG_EXECUTE_r(
7099     PerlIO_printf(Perl_debug_log,
7100       "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
7101       (int)(REPORT_CODE_OFF+(depth*2)), "",
7102       (IV) ST.count, (IV)ST.alen)
7103    );
7104
7105    if (cur_eval && cur_eval->u.eval.close_paren &&
7106     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
7107     goto fake_end;
7108
7109    {
7110     I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7111     if ( max == REG_INFTY || ST.count < max )
7112      goto curlym_do_A; /* try to match another A */
7113    }
7114    goto curlym_do_B; /* try to match B */
7115
7116   case CURLYM_A_fail: /* just failed to match an A */
7117    REGCP_UNWIND(ST.cp);
7118
7119    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7120     || (cur_eval && cur_eval->u.eval.close_paren &&
7121      cur_eval->u.eval.close_paren == (U32)ST.me->flags))
7122     sayNO;
7123
7124   curlym_do_B: /* execute the B in /A{m,n}B/  */
7125    if (ST.c1 == CHRTEST_UNINIT) {
7126     /* calculate c1 and c2 for possible match of 1st char
7127     * following curly */
7128     ST.c1 = ST.c2 = CHRTEST_VOID;
7129     assert(ST.B);
7130     if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7131      regnode *text_node = ST.B;
7132      if (! HAS_TEXT(text_node))
7133       FIND_NEXT_IMPT(text_node);
7134      /* this used to be
7135
7136       (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7137
7138        But the former is redundant in light of the latter.
7139
7140        if this changes back then the macro for
7141        IS_TEXT and friends need to change.
7142      */
7143      if (PL_regkind[OP(text_node)] == EXACT) {
7144       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7145       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7146       reginfo))
7147       {
7148        sayNO;
7149       }
7150      }
7151     }
7152    }
7153
7154    DEBUG_EXECUTE_r(
7155     PerlIO_printf(Perl_debug_log,
7156      "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
7157      (int)(REPORT_CODE_OFF+(depth*2)),
7158      "", (IV)ST.count)
7159     );
7160    if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7161     if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7162      if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7163       && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7164      {
7165       /* simulate B failing */
7166       DEBUG_OPTIMISE_r(
7167        PerlIO_printf(Perl_debug_log,
7168         "%*s  CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
7169         (int)(REPORT_CODE_OFF+(depth*2)),"",
7170         valid_utf8_to_uvchr((U8 *) locinput, NULL),
7171         valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7172         valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7173       );
7174       state_num = CURLYM_B_fail;
7175       goto reenter_switch;
7176      }
7177     }
7178     else if (nextchr != ST.c1 && nextchr != ST.c2) {
7179      /* simulate B failing */
7180      DEBUG_OPTIMISE_r(
7181       PerlIO_printf(Perl_debug_log,
7182        "%*s  CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7183        (int)(REPORT_CODE_OFF+(depth*2)),"",
7184        (int) nextchr, ST.c1, ST.c2)
7185      );
7186      state_num = CURLYM_B_fail;
7187      goto reenter_switch;
7188     }
7189    }
7190
7191    if (ST.me->flags) {
7192     /* emulate CLOSE: mark current A as captured */
7193     I32 paren = ST.me->flags;
7194     if (ST.count) {
7195      rex->offs[paren].start
7196       = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7197      rex->offs[paren].end = locinput - reginfo->strbeg;
7198      if ((U32)paren > rex->lastparen)
7199       rex->lastparen = paren;
7200      rex->lastcloseparen = paren;
7201     }
7202     else
7203      rex->offs[paren].end = -1;
7204     if (cur_eval && cur_eval->u.eval.close_paren &&
7205      cur_eval->u.eval.close_paren == (U32)ST.me->flags)
7206     {
7207      if (ST.count)
7208       goto fake_end;
7209      else
7210       sayNO;
7211     }
7212    }
7213
7214    PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7215    /* NOTREACHED */
7216    NOT_REACHED; /* NOTREACHED */
7217
7218   case CURLYM_B_fail: /* just failed to match a B */
7219    REGCP_UNWIND(ST.cp);
7220    UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7221    if (ST.minmod) {
7222     I32 max = ARG2(ST.me);
7223     if (max != REG_INFTY && ST.count == max)
7224      sayNO;
7225     goto curlym_do_A; /* try to match a further A */
7226    }
7227    /* backtrack one A */
7228    if (ST.count == ARG1(ST.me) /* min */)
7229     sayNO;
7230    ST.count--;
7231    SET_locinput(HOPc(locinput, -ST.alen));
7232    goto curlym_do_B; /* try to match B */
7233
7234 #undef ST
7235 #define ST st->u.curly
7236
7237 #define CURLY_SETPAREN(paren, success) \
7238  if (paren) { \
7239   if (success) { \
7240    rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7241    rex->offs[paren].end = locinput - reginfo->strbeg; \
7242    if (paren > rex->lastparen) \
7243     rex->lastparen = paren; \
7244    rex->lastcloseparen = paren; \
7245   } \
7246   else { \
7247    rex->offs[paren].end = -1; \
7248    rex->lastparen      = ST.lastparen; \
7249    rex->lastcloseparen = ST.lastcloseparen; \
7250   } \
7251  }
7252
7253   case STAR:  /*  /A*B/ where A is width 1 char */
7254    ST.paren = 0;
7255    ST.min = 0;
7256    ST.max = REG_INFTY;
7257    scan = NEXTOPER(scan);
7258    goto repeat;
7259
7260   case PLUS:  /*  /A+B/ where A is width 1 char */
7261    ST.paren = 0;
7262    ST.min = 1;
7263    ST.max = REG_INFTY;
7264    scan = NEXTOPER(scan);
7265    goto repeat;
7266
7267   case CURLYN:  /*  /(A){m,n}B/ where A is width 1 char */
7268    ST.paren = scan->flags; /* Which paren to set */
7269    ST.lastparen      = rex->lastparen;
7270    ST.lastcloseparen = rex->lastcloseparen;
7271    if (ST.paren > maxopenparen)
7272     maxopenparen = ST.paren;
7273    ST.min = ARG1(scan);  /* min to match */
7274    ST.max = ARG2(scan);  /* max to match */
7275    if (cur_eval && cur_eval->u.eval.close_paren &&
7276     cur_eval->u.eval.close_paren == (U32)ST.paren) {
7277     ST.min=1;
7278     ST.max=1;
7279    }
7280    scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7281    goto repeat;
7282
7283   case CURLY:  /*  /A{m,n}B/ where A is width 1 char */
7284    ST.paren = 0;
7285    ST.min = ARG1(scan);  /* min to match */
7286    ST.max = ARG2(scan);  /* max to match */
7287    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7288   repeat:
7289    /*
7290    * Lookahead to avoid useless match attempts
7291    * when we know what character comes next.
7292    *
7293    * Used to only do .*x and .*?x, but now it allows
7294    * for )'s, ('s and (?{ ... })'s to be in the way
7295    * of the quantifier and the EXACT-like node.  -- japhy
7296    */
7297
7298    assert(ST.min <= ST.max);
7299    if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7300     ST.c1 = ST.c2 = CHRTEST_VOID;
7301    }
7302    else {
7303     regnode *text_node = next;
7304
7305     if (! HAS_TEXT(text_node))
7306      FIND_NEXT_IMPT(text_node);
7307
7308     if (! HAS_TEXT(text_node))
7309      ST.c1 = ST.c2 = CHRTEST_VOID;
7310     else {
7311      if ( PL_regkind[OP(text_node)] != EXACT ) {
7312       ST.c1 = ST.c2 = CHRTEST_VOID;
7313      }
7314      else {
7315
7316      /*  Currently we only get here when
7317
7318       PL_rekind[OP(text_node)] == EXACT
7319
7320       if this changes back then the macro for IS_TEXT and
7321       friends need to change. */
7322       if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7323       text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7324       reginfo))
7325       {
7326        sayNO;
7327       }
7328      }
7329     }
7330    }
7331
7332    ST.A = scan;
7333    ST.B = next;
7334    if (minmod) {
7335     char *li = locinput;
7336     minmod = 0;
7337     if (ST.min &&
7338       regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
7339        < ST.min)
7340      sayNO;
7341     SET_locinput(li);
7342     ST.count = ST.min;
7343     REGCP_SET(ST.cp);
7344     if (ST.c1 == CHRTEST_VOID)
7345      goto curly_try_B_min;
7346
7347     ST.oldloc = locinput;
7348
7349     /* set ST.maxpos to the furthest point along the
7350     * string that could possibly match */
7351     if  (ST.max == REG_INFTY) {
7352      ST.maxpos = reginfo->strend - 1;
7353      if (utf8_target)
7354       while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
7355        ST.maxpos--;
7356     }
7357     else if (utf8_target) {
7358      int m = ST.max - ST.min;
7359      for (ST.maxpos = locinput;
7360       m >0 && ST.maxpos < reginfo->strend; m--)
7361       ST.maxpos += UTF8SKIP(ST.maxpos);
7362     }
7363     else {
7364      ST.maxpos = locinput + ST.max - ST.min;
7365      if (ST.maxpos >= reginfo->strend)
7366       ST.maxpos = reginfo->strend - 1;
7367     }
7368     goto curly_try_B_min_known;
7369
7370    }
7371    else {
7372     /* avoid taking address of locinput, so it can remain
7373     * a register var */
7374     char *li = locinput;
7375     ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
7376     if (ST.count < ST.min)
7377      sayNO;
7378     SET_locinput(li);
7379     if ((ST.count > ST.min)
7380      && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
7381     {
7382      /* A{m,n} must come at the end of the string, there's
7383      * no point in backing off ... */
7384      ST.min = ST.count;
7385      /* ...except that $ and \Z can match before *and* after
7386      newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
7387      We may back off by one in this case. */
7388      if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
7389       ST.min--;
7390     }
7391     REGCP_SET(ST.cp);
7392     goto curly_try_B_max;
7393    }
7394    /* NOTREACHED */
7395    NOT_REACHED; /* NOTREACHED */
7396
7397   case CURLY_B_min_known_fail:
7398    /* failed to find B in a non-greedy match where c1,c2 valid */
7399
7400    REGCP_UNWIND(ST.cp);
7401    if (ST.paren) {
7402     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7403    }
7404    /* Couldn't or didn't -- move forward. */
7405    ST.oldloc = locinput;
7406    if (utf8_target)
7407     locinput += UTF8SKIP(locinput);
7408    else
7409     locinput++;
7410    ST.count++;
7411   curly_try_B_min_known:
7412    /* find the next place where 'B' could work, then call B */
7413    {
7414     int n;
7415     if (utf8_target) {
7416      n = (ST.oldloc == locinput) ? 0 : 1;
7417      if (ST.c1 == ST.c2) {
7418       /* set n to utf8_distance(oldloc, locinput) */
7419       while (locinput <= ST.maxpos
7420        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
7421       {
7422        locinput += UTF8SKIP(locinput);
7423        n++;
7424       }
7425      }
7426      else {
7427       /* set n to utf8_distance(oldloc, locinput) */
7428       while (locinput <= ST.maxpos
7429        && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7430        && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7431       {
7432        locinput += UTF8SKIP(locinput);
7433        n++;
7434       }
7435      }
7436     }
7437     else {  /* Not utf8_target */
7438      if (ST.c1 == ST.c2) {
7439       while (locinput <= ST.maxpos &&
7440        UCHARAT(locinput) != ST.c1)
7441        locinput++;
7442      }
7443      else {
7444       while (locinput <= ST.maxpos
7445        && UCHARAT(locinput) != ST.c1
7446        && UCHARAT(locinput) != ST.c2)
7447        locinput++;
7448      }
7449      n = locinput - ST.oldloc;
7450     }
7451     if (locinput > ST.maxpos)
7452      sayNO;
7453     if (n) {
7454      /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
7455      * at b; check that everything between oldloc and
7456      * locinput matches */
7457      char *li = ST.oldloc;
7458      ST.count += n;
7459      if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
7460       sayNO;
7461      assert(n == REG_INFTY || locinput == li);
7462     }
7463     CURLY_SETPAREN(ST.paren, ST.count);
7464     if (cur_eval && cur_eval->u.eval.close_paren &&
7465      cur_eval->u.eval.close_paren == (U32)ST.paren) {
7466      goto fake_end;
7467     }
7468     PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
7469    }
7470    /* NOTREACHED */
7471    NOT_REACHED; /* NOTREACHED */
7472
7473   case CURLY_B_min_fail:
7474    /* failed to find B in a non-greedy match where c1,c2 invalid */
7475
7476    REGCP_UNWIND(ST.cp);
7477    if (ST.paren) {
7478     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7479    }
7480    /* failed -- move forward one */
7481    {
7482     char *li = locinput;
7483     if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
7484      sayNO;
7485     }
7486     locinput = li;
7487    }
7488    {
7489     ST.count++;
7490     if (ST.count <= ST.max || (ST.max == REG_INFTY &&
7491       ST.count > 0)) /* count overflow ? */
7492     {
7493     curly_try_B_min:
7494      CURLY_SETPAREN(ST.paren, ST.count);
7495      if (cur_eval && cur_eval->u.eval.close_paren &&
7496       cur_eval->u.eval.close_paren == (U32)ST.paren) {
7497       goto fake_end;
7498      }
7499      PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
7500     }
7501    }
7502    sayNO;
7503    /* NOTREACHED */
7504    NOT_REACHED; /* NOTREACHED */
7505
7506   curly_try_B_max:
7507    /* a successful greedy match: now try to match B */
7508    if (cur_eval && cur_eval->u.eval.close_paren &&
7509     cur_eval->u.eval.close_paren == (U32)ST.paren) {
7510     goto fake_end;
7511    }
7512    {
7513     bool could_match = locinput < reginfo->strend;
7514
7515     /* If it could work, try it. */
7516     if (ST.c1 != CHRTEST_VOID && could_match) {
7517      if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
7518      {
7519       could_match = memEQ(locinput,
7520            ST.c1_utf8,
7521            UTF8SKIP(locinput))
7522          || memEQ(locinput,
7523            ST.c2_utf8,
7524            UTF8SKIP(locinput));
7525      }
7526      else {
7527       could_match = UCHARAT(locinput) == ST.c1
7528          || UCHARAT(locinput) == ST.c2;
7529      }
7530     }
7531     if (ST.c1 == CHRTEST_VOID || could_match) {
7532      CURLY_SETPAREN(ST.paren, ST.count);
7533      PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
7534      /* NOTREACHED */
7535      NOT_REACHED; /* NOTREACHED */
7536     }
7537    }
7538    /* FALLTHROUGH */
7539
7540   case CURLY_B_max_fail:
7541    /* failed to find B in a greedy match */
7542
7543    REGCP_UNWIND(ST.cp);
7544    if (ST.paren) {
7545     UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7546    }
7547    /*  back up. */
7548    if (--ST.count < ST.min)
7549     sayNO;
7550    locinput = HOPc(locinput, -1);
7551    goto curly_try_B_max;
7552
7553 #undef ST
7554
7555   case END: /*  last op of main pattern  */
7556   fake_end:
7557    if (cur_eval) {
7558     /* we've just finished A in /(??{A})B/; now continue with B */
7559
7560     st->u.eval.prev_rex = rex_sv;  /* inner */
7561
7562     /* Save *all* the positions. */
7563     st->u.eval.cp = regcppush(rex, 0, maxopenparen);
7564     rex_sv = cur_eval->u.eval.prev_rex;
7565     is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7566     SET_reg_curpm(rex_sv);
7567     rex = ReANY(rex_sv);
7568     rexi = RXi_GET(rex);
7569     cur_curlyx = cur_eval->u.eval.prev_curlyx;
7570
7571     REGCP_SET(st->u.eval.lastcp);
7572
7573     /* Restore parens of the outer rex without popping the
7574     * savestack */
7575     S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
7576           &maxopenparen);
7577
7578     st->u.eval.prev_eval = cur_eval;
7579     cur_eval = cur_eval->u.eval.prev_eval;
7580     DEBUG_EXECUTE_r(
7581      PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
7582          REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
7583     if ( nochange_depth )
7584      nochange_depth--;
7585
7586     PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
7587          locinput); /* match B */
7588    }
7589
7590    if (locinput < reginfo->till) {
7591     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7592          "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
7593          PL_colors[4],
7594          (long)(locinput - startpos),
7595          (long)(reginfo->till - startpos),
7596          PL_colors[5]));
7597
7598     sayNO_SILENT;  /* Cannot match: too short. */
7599    }
7600    sayYES;   /* Success! */
7601
7602   case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
7603    DEBUG_EXECUTE_r(
7604    PerlIO_printf(Perl_debug_log,
7605     "%*s  %ssubpattern success...%s\n",
7606     REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
7607    sayYES;   /* Success! */
7608
7609 #undef  ST
7610 #define ST st->u.ifmatch
7611
7612   {
7613    char *newstart;
7614
7615   case SUSPEND: /* (?>A) */
7616    ST.wanted = 1;
7617    newstart = locinput;
7618    goto do_ifmatch;
7619
7620   case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
7621    ST.wanted = 0;
7622    goto ifmatch_trivial_fail_test;
7623
7624   case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
7625    ST.wanted = 1;
7626   ifmatch_trivial_fail_test:
7627    if (scan->flags) {
7628     char * const s = HOPBACKc(locinput, scan->flags);
7629     if (!s) {
7630      /* trivial fail */
7631      if (logical) {
7632       logical = 0;
7633       sw = 1 - cBOOL(ST.wanted);
7634      }
7635      else if (ST.wanted)
7636       sayNO;
7637      next = scan + ARG(scan);
7638      if (next == scan)
7639       next = NULL;
7640      break;
7641     }
7642     newstart = s;
7643    }
7644    else
7645     newstart = locinput;
7646
7647   do_ifmatch:
7648    ST.me = scan;
7649    ST.logical = logical;
7650    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
7651
7652    /* execute body of (?...A) */
7653    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
7654    /* NOTREACHED */
7655    NOT_REACHED; /* NOTREACHED */
7656   }
7657
7658   case IFMATCH_A_fail: /* body of (?...A) failed */
7659    ST.wanted = !ST.wanted;
7660    /* FALLTHROUGH */
7661
7662   case IFMATCH_A: /* body of (?...A) succeeded */
7663    if (ST.logical) {
7664     sw = cBOOL(ST.wanted);
7665    }
7666    else if (!ST.wanted)
7667     sayNO;
7668
7669    if (OP(ST.me) != SUSPEND) {
7670     /* restore old position except for (?>...) */
7671     locinput = st->locinput;
7672    }
7673    scan = ST.me + ARG(ST.me);
7674    if (scan == ST.me)
7675     scan = NULL;
7676    continue; /* execute B */
7677
7678 #undef ST
7679
7680   case LONGJMP: /*  alternative with many branches compiles to
7681      * (BRANCHJ; EXACT ...; LONGJMP ) x N */
7682    next = scan + ARG(scan);
7683    if (next == scan)
7684     next = NULL;
7685    break;
7686
7687   case COMMIT:  /*  (*COMMIT)  */
7688    reginfo->cutpoint = reginfo->strend;
7689    /* FALLTHROUGH */
7690
7691   case PRUNE:   /*  (*PRUNE)   */
7692    if (!scan->flags)
7693     sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7694    PUSH_STATE_GOTO(COMMIT_next, next, locinput);
7695    /* NOTREACHED */
7696    NOT_REACHED; /* NOTREACHED */
7697
7698   case COMMIT_next_fail:
7699    no_final = 1;
7700    /* FALLTHROUGH */
7701
7702   case OPFAIL:   /* (*FAIL)  */
7703    sayNO;
7704    /* NOTREACHED */
7705    NOT_REACHED; /* NOTREACHED */
7706
7707 #define ST st->u.mark
7708   case MARKPOINT: /*  (*MARK:foo)  */
7709    ST.prev_mark = mark_state;
7710    ST.mark_name = sv_commit = sv_yes_mark
7711     = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7712    mark_state = st;
7713    ST.mark_loc = locinput;
7714    PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
7715    /* NOTREACHED */
7716    NOT_REACHED; /* NOTREACHED */
7717
7718   case MARKPOINT_next:
7719    mark_state = ST.prev_mark;
7720    sayYES;
7721    /* NOTREACHED */
7722    NOT_REACHED; /* NOTREACHED */
7723
7724   case MARKPOINT_next_fail:
7725    if (popmark && sv_eq(ST.mark_name,popmark))
7726    {
7727     if (ST.mark_loc > startpoint)
7728      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
7729     popmark = NULL; /* we found our mark */
7730     sv_commit = ST.mark_name;
7731
7732     DEBUG_EXECUTE_r({
7733       PerlIO_printf(Perl_debug_log,
7734        "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
7735        REPORT_CODE_OFF+depth*2, "",
7736        PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
7737     });
7738    }
7739    mark_state = ST.prev_mark;
7740    sv_yes_mark = mark_state ?
7741     mark_state->u.mark.mark_name : NULL;
7742    sayNO;
7743    /* NOTREACHED */
7744    NOT_REACHED; /* NOTREACHED */
7745
7746   case SKIP:  /*  (*SKIP)  */
7747    if (scan->flags) {
7748     /* (*SKIP) : if we fail we cut here*/
7749     ST.mark_name = NULL;
7750     ST.mark_loc = locinput;
7751     PUSH_STATE_GOTO(SKIP_next,next, locinput);
7752    } else {
7753     /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
7754     otherwise do nothing.  Meaning we need to scan
7755     */
7756     regmatch_state *cur = mark_state;
7757     SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7758
7759     while (cur) {
7760      if ( sv_eq( cur->u.mark.mark_name,
7761         find ) )
7762      {
7763       ST.mark_name = find;
7764       PUSH_STATE_GOTO( SKIP_next, next, locinput);
7765      }
7766      cur = cur->u.mark.prev_mark;
7767     }
7768    }
7769    /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
7770    break;
7771
7772   case SKIP_next_fail:
7773    if (ST.mark_name) {
7774     /* (*CUT:NAME) - Set up to search for the name as we
7775     collapse the stack*/
7776     popmark = ST.mark_name;
7777    } else {
7778     /* (*CUT) - No name, we cut here.*/
7779     if (ST.mark_loc > startpoint)
7780      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
7781     /* but we set sv_commit to latest mark_name if there
7782     is one so they can test to see how things lead to this
7783     cut */
7784     if (mark_state)
7785      sv_commit=mark_state->u.mark.mark_name;
7786    }
7787    no_final = 1;
7788    sayNO;
7789    /* NOTREACHED */
7790    NOT_REACHED; /* NOTREACHED */
7791 #undef ST
7792
7793   case LNBREAK: /* \R */
7794    if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
7795     locinput += n;
7796    } else
7797     sayNO;
7798    break;
7799
7800   default:
7801    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
7802       PTR2UV(scan), OP(scan));
7803    Perl_croak(aTHX_ "regexp memory corruption");
7804
7805   /* this is a point to jump to in order to increment
7806   * locinput by one character */
7807   increment_locinput:
7808    assert(!NEXTCHR_IS_EOS);
7809    if (utf8_target) {
7810     locinput += PL_utf8skip[nextchr];
7811     /* locinput is allowed to go 1 char off the end, but not 2+ */
7812     if (locinput > reginfo->strend)
7813      sayNO;
7814    }
7815    else
7816     locinput++;
7817    break;
7818
7819   } /* end switch */
7820
7821   /* switch break jumps here */
7822   scan = next; /* prepare to execute the next op and ... */
7823   continue;    /* ... jump back to the top, reusing st */
7824   /* NOTREACHED */
7825
7826  push_yes_state:
7827   /* push a state that backtracks on success */
7828   st->u.yes.prev_yes_state = yes_state;
7829   yes_state = st;
7830   /* FALLTHROUGH */
7831  push_state:
7832   /* push a new regex state, then continue at scan  */
7833   {
7834    regmatch_state *newst;
7835
7836    DEBUG_STACK_r({
7837     regmatch_state *cur = st;
7838     regmatch_state *curyes = yes_state;
7839     int curd = depth;
7840     regmatch_slab *slab = PL_regmatch_slab;
7841     for (;curd > -1;cur--,curd--) {
7842      if (cur < SLAB_FIRST(slab)) {
7843       slab = slab->prev;
7844       cur = SLAB_LAST(slab);
7845      }
7846      PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
7847       REPORT_CODE_OFF + 2 + depth * 2,"",
7848       curd, PL_reg_name[cur->resume_state],
7849       (curyes == cur) ? "yes" : ""
7850      );
7851      if (curyes == cur)
7852       curyes = cur->u.yes.prev_yes_state;
7853     }
7854    } else
7855     DEBUG_STATE_pp("push")
7856    );
7857    depth++;
7858    st->locinput = locinput;
7859    newst = st+1;
7860    if (newst >  SLAB_LAST(PL_regmatch_slab))
7861     newst = S_push_slab(aTHX);
7862    PL_regmatch_state = newst;
7863
7864    locinput = pushinput;
7865    st = newst;
7866    continue;
7867    /* NOTREACHED */
7868   }
7869  }
7870
7871  /*
7872  * We get here only if there's trouble -- normally "case END" is
7873  * the terminating point.
7874  */
7875  Perl_croak(aTHX_ "corrupted regexp pointers");
7876  /* NOTREACHED */
7877  sayNO;
7878  NOT_REACHED; /* NOTREACHED */
7879
7880   yes:
7881  if (yes_state) {
7882   /* we have successfully completed a subexpression, but we must now
7883   * pop to the state marked by yes_state and continue from there */
7884   assert(st != yes_state);
7885 #ifdef DEBUGGING
7886   while (st != yes_state) {
7887    st--;
7888    if (st < SLAB_FIRST(PL_regmatch_slab)) {
7889     PL_regmatch_slab = PL_regmatch_slab->prev;
7890     st = SLAB_LAST(PL_regmatch_slab);
7891    }
7892    DEBUG_STATE_r({
7893     if (no_final) {
7894      DEBUG_STATE_pp("pop (no final)");
7895     } else {
7896      DEBUG_STATE_pp("pop (yes)");
7897     }
7898    });
7899    depth--;
7900   }
7901 #else
7902   while (yes_state < SLAB_FIRST(PL_regmatch_slab)
7903    || yes_state > SLAB_LAST(PL_regmatch_slab))
7904   {
7905    /* not in this slab, pop slab */
7906    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
7907    PL_regmatch_slab = PL_regmatch_slab->prev;
7908    st = SLAB_LAST(PL_regmatch_slab);
7909   }
7910   depth -= (st - yes_state);
7911 #endif
7912   st = yes_state;
7913   yes_state = st->u.yes.prev_yes_state;
7914   PL_regmatch_state = st;
7915
7916   if (no_final)
7917    locinput= st->locinput;
7918   state_num = st->resume_state + no_final;
7919   goto reenter_switch;
7920  }
7921
7922  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
7923       PL_colors[4], PL_colors[5]));
7924
7925  if (reginfo->info_aux_eval) {
7926   /* each successfully executed (?{...}) block does the equivalent of
7927   *   local $^R = do {...}
7928   * When popping the save stack, all these locals would be undone;
7929   * bypass this by setting the outermost saved $^R to the latest
7930   * value */
7931   /* I dont know if this is needed or works properly now.
7932   * see code related to PL_replgv elsewhere in this file.
7933   * Yves
7934   */
7935   if (oreplsv != GvSV(PL_replgv))
7936    sv_setsv(oreplsv, GvSV(PL_replgv));
7937  }
7938  result = 1;
7939  goto final_exit;
7940
7941   no:
7942  DEBUG_EXECUTE_r(
7943   PerlIO_printf(Perl_debug_log,
7944    "%*s  %sfailed...%s\n",
7945    REPORT_CODE_OFF+depth*2, "",
7946    PL_colors[4], PL_colors[5])
7947   );
7948
7949   no_silent:
7950  if (no_final) {
7951   if (yes_state) {
7952    goto yes;
7953   } else {
7954    goto final_exit;
7955   }
7956  }
7957  if (depth) {
7958   /* there's a previous state to backtrack to */
7959   st--;
7960   if (st < SLAB_FIRST(PL_regmatch_slab)) {
7961    PL_regmatch_slab = PL_regmatch_slab->prev;
7962    st = SLAB_LAST(PL_regmatch_slab);
7963   }
7964   PL_regmatch_state = st;
7965   locinput= st->locinput;
7966
7967   DEBUG_STATE_pp("pop");
7968   depth--;
7969   if (yes_state == st)
7970    yes_state = st->u.yes.prev_yes_state;
7971
7972   state_num = st->resume_state + 1; /* failure = success + 1 */
7973   goto reenter_switch;
7974  }
7975  result = 0;
7976
7977   final_exit:
7978  if (rex->intflags & PREGf_VERBARG_SEEN) {
7979   SV *sv_err = get_sv("REGERROR", 1);
7980   SV *sv_mrk = get_sv("REGMARK", 1);
7981   if (result) {
7982    sv_commit = &PL_sv_no;
7983    if (!sv_yes_mark)
7984     sv_yes_mark = &PL_sv_yes;
7985   } else {
7986    if (!sv_commit)
7987     sv_commit = &PL_sv_yes;
7988    sv_yes_mark = &PL_sv_no;
7989   }
7990   assert(sv_err);
7991   assert(sv_mrk);
7992   sv_setsv(sv_err, sv_commit);
7993   sv_setsv(sv_mrk, sv_yes_mark);
7994  }
7995
7996
7997  if (last_pushed_cv) {
7998   dSP;
7999   POP_MULTICALL;
8000   PERL_UNUSED_VAR(SP);
8001  }
8002
8003  assert(!result ||  locinput - reginfo->strbeg >= 0);
8004  return result ?  locinput - reginfo->strbeg : -1;
8005 }
8006
8007 /*
8008  - regrepeat - repeatedly match something simple, report how many
8009  *
8010  * What 'simple' means is a node which can be the operand of a quantifier like
8011  * '+', or {1,3}
8012  *
8013  * startposp - pointer a pointer to the start position.  This is updated
8014  *             to point to the byte following the highest successful
8015  *             match.
8016  * p         - the regnode to be repeatedly matched against.
8017  * reginfo   - struct holding match state, such as strend
8018  * max       - maximum number of things to match.
8019  * depth     - (for debugging) backtracking depth.
8020  */
8021 STATIC I32
8022 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8023    regmatch_info *const reginfo, I32 max, int depth)
8024 {
8025  char *scan;     /* Pointer to current position in target string */
8026  I32 c;
8027  char *loceol = reginfo->strend;   /* local version */
8028  I32 hardcount = 0;  /* How many matches so far */
8029  bool utf8_target = reginfo->is_utf8_target;
8030  unsigned int to_complement = 0;  /* Invert the result? */
8031  UV utf8_flags;
8032  _char_class_number classnum;
8033 #ifndef DEBUGGING
8034  PERL_UNUSED_ARG(depth);
8035 #endif
8036
8037  PERL_ARGS_ASSERT_REGREPEAT;
8038
8039  scan = *startposp;
8040  if (max == REG_INFTY)
8041   max = I32_MAX;
8042  else if (! utf8_target && loceol - scan > max)
8043   loceol = scan + max;
8044
8045  /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8046  * to the maximum of how far we should go in it (leaving it set to the real
8047  * end, if the maximum permissible would take us beyond that).  This allows
8048  * us to make the loop exit condition that we haven't gone past <loceol> to
8049  * also mean that we haven't exceeded the max permissible count, saving a
8050  * test each time through the loop.  But it assumes that the OP matches a
8051  * single byte, which is true for most of the OPs below when applied to a
8052  * non-UTF-8 target.  Those relatively few OPs that don't have this
8053  * characteristic will have to compensate.
8054  *
8055  * There is no adjustment for UTF-8 targets, as the number of bytes per
8056  * character varies.  OPs will have to test both that the count is less
8057  * than the max permissible (using <hardcount> to keep track), and that we
8058  * are still within the bounds of the string (using <loceol>.  A few OPs
8059  * match a single byte no matter what the encoding.  They can omit the max
8060  * test if, for the UTF-8 case, they do the adjustment that was skipped
8061  * above.
8062  *
8063  * Thus, the code above sets things up for the common case; and exceptional
8064  * cases need extra work; the common case is to make sure <scan> doesn't
8065  * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8066  * count doesn't exceed the maximum permissible */
8067
8068  switch (OP(p)) {
8069  case REG_ANY:
8070   if (utf8_target) {
8071    while (scan < loceol && hardcount < max && *scan != '\n') {
8072     scan += UTF8SKIP(scan);
8073     hardcount++;
8074    }
8075   } else {
8076    while (scan < loceol && *scan != '\n')
8077     scan++;
8078   }
8079   break;
8080  case SANY:
8081   if (utf8_target) {
8082    while (scan < loceol && hardcount < max) {
8083     scan += UTF8SKIP(scan);
8084     hardcount++;
8085    }
8086   }
8087   else
8088    scan = loceol;
8089   break;
8090  case EXACTL:
8091   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8092   if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8093    _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8094   }
8095   /* FALLTHROUGH */
8096  case EXACT:
8097   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8098
8099   c = (U8)*STRING(p);
8100
8101   /* Can use a simple loop if the pattern char to match on is invariant
8102   * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
8103   * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8104   * true iff it doesn't matter if the argument is in UTF-8 or not */
8105   if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8106    if (utf8_target && loceol - scan > max) {
8107     /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8108     * since here, to match at all, 1 char == 1 byte */
8109     loceol = scan + max;
8110    }
8111    while (scan < loceol && UCHARAT(scan) == c) {
8112     scan++;
8113    }
8114   }
8115   else if (reginfo->is_utf8_pat) {
8116    if (utf8_target) {
8117     STRLEN scan_char_len;
8118
8119     /* When both target and pattern are UTF-8, we have to do
8120     * string EQ */
8121     while (hardcount < max
8122      && scan < loceol
8123      && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8124      && memEQ(scan, STRING(p), scan_char_len))
8125     {
8126      scan += scan_char_len;
8127      hardcount++;
8128     }
8129    }
8130    else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8131
8132     /* Target isn't utf8; convert the character in the UTF-8
8133     * pattern to non-UTF8, and do a simple loop */
8134     c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8135     while (scan < loceol && UCHARAT(scan) == c) {
8136      scan++;
8137     }
8138    } /* else pattern char is above Latin1, can't possibly match the
8139     non-UTF-8 target */
8140   }
8141   else {
8142
8143    /* Here, the string must be utf8; pattern isn't, and <c> is
8144    * different in utf8 than not, so can't compare them directly.
8145    * Outside the loop, find the two utf8 bytes that represent c, and
8146    * then look for those in sequence in the utf8 string */
8147    U8 high = UTF8_TWO_BYTE_HI(c);
8148    U8 low = UTF8_TWO_BYTE_LO(c);
8149
8150    while (hardcount < max
8151      && scan + 1 < loceol
8152      && UCHARAT(scan) == high
8153      && UCHARAT(scan + 1) == low)
8154    {
8155     scan += 2;
8156     hardcount++;
8157    }
8158   }
8159   break;
8160
8161  case EXACTFA_NO_TRIE:   /* This node only generated for non-utf8 patterns */
8162   assert(! reginfo->is_utf8_pat);
8163   /* FALLTHROUGH */
8164  case EXACTFA:
8165   utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8166   goto do_exactf;
8167
8168  case EXACTFL:
8169   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8170   utf8_flags = FOLDEQ_LOCALE;
8171   goto do_exactf;
8172
8173  case EXACTF:   /* This node only generated for non-utf8 patterns */
8174   assert(! reginfo->is_utf8_pat);
8175   utf8_flags = 0;
8176   goto do_exactf;
8177
8178  case EXACTFLU8:
8179   if (! utf8_target) {
8180    break;
8181   }
8182   utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8183          | FOLDEQ_S2_FOLDS_SANE;
8184   goto do_exactf;
8185
8186  case EXACTFU_SS:
8187  case EXACTFU:
8188   utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8189
8190  do_exactf: {
8191   int c1, c2;
8192   U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8193
8194   assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8195
8196   if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8197           reginfo))
8198   {
8199    if (c1 == CHRTEST_VOID) {
8200     /* Use full Unicode fold matching */
8201     char *tmpeol = reginfo->strend;
8202     STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8203     while (hardcount < max
8204       && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8205            STRING(p), NULL, pat_len,
8206            reginfo->is_utf8_pat, utf8_flags))
8207     {
8208      scan = tmpeol;
8209      tmpeol = reginfo->strend;
8210      hardcount++;
8211     }
8212    }
8213    else if (utf8_target) {
8214     if (c1 == c2) {
8215      while (scan < loceol
8216       && hardcount < max
8217       && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8218      {
8219       scan += UTF8SKIP(scan);
8220       hardcount++;
8221      }
8222     }
8223     else {
8224      while (scan < loceol
8225       && hardcount < max
8226       && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8227        || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8228      {
8229       scan += UTF8SKIP(scan);
8230       hardcount++;
8231      }
8232     }
8233    }
8234    else if (c1 == c2) {
8235     while (scan < loceol && UCHARAT(scan) == c1) {
8236      scan++;
8237     }
8238    }
8239    else {
8240     while (scan < loceol &&
8241      (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8242     {
8243      scan++;
8244     }
8245    }
8246   }
8247   break;
8248  }
8249  case ANYOFL:
8250   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8251   /* FALLTHROUGH */
8252  case ANYOF:
8253   if (utf8_target) {
8254    while (hardcount < max
8255     && scan < loceol
8256     && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8257    {
8258     scan += UTF8SKIP(scan);
8259     hardcount++;
8260    }
8261   } else {
8262    while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
8263     scan++;
8264   }
8265   break;
8266
8267  /* The argument (FLAGS) to all the POSIX node types is the class number */
8268
8269  case NPOSIXL:
8270   to_complement = 1;
8271   /* FALLTHROUGH */
8272
8273  case POSIXL:
8274   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8275   if (! utf8_target) {
8276    while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8277                 *scan)))
8278    {
8279     scan++;
8280    }
8281   } else {
8282    while (hardcount < max && scan < loceol
8283     && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8284                 (U8 *) scan)))
8285    {
8286     scan += UTF8SKIP(scan);
8287     hardcount++;
8288    }
8289   }
8290   break;
8291
8292  case POSIXD:
8293   if (utf8_target) {
8294    goto utf8_posix;
8295   }
8296   /* FALLTHROUGH */
8297
8298  case POSIXA:
8299   if (utf8_target && loceol - scan > max) {
8300
8301    /* We didn't adjust <loceol> at the beginning of this routine
8302    * because is UTF-8, but it is actually ok to do so, since here, to
8303    * match, 1 char == 1 byte. */
8304    loceol = scan + max;
8305   }
8306   while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
8307    scan++;
8308   }
8309   break;
8310
8311  case NPOSIXD:
8312   if (utf8_target) {
8313    to_complement = 1;
8314    goto utf8_posix;
8315   }
8316   /* FALLTHROUGH */
8317
8318  case NPOSIXA:
8319   if (! utf8_target) {
8320    while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
8321     scan++;
8322    }
8323   }
8324   else {
8325
8326    /* The complement of something that matches only ASCII matches all
8327    * non-ASCII, plus everything in ASCII that isn't in the class. */
8328    while (hardcount < max && scan < loceol
8329     && (! isASCII_utf8(scan)
8330      || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
8331    {
8332     scan += UTF8SKIP(scan);
8333     hardcount++;
8334    }
8335   }
8336   break;
8337
8338  case NPOSIXU:
8339   to_complement = 1;
8340   /* FALLTHROUGH */
8341
8342  case POSIXU:
8343   if (! utf8_target) {
8344    while (scan < loceol && to_complement
8345         ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
8346    {
8347     scan++;
8348    }
8349   }
8350   else {
8351   utf8_posix:
8352    classnum = (_char_class_number) FLAGS(p);
8353    if (classnum < _FIRST_NON_SWASH_CC) {
8354
8355     /* Here, a swash is needed for above-Latin1 code points.
8356     * Process as many Latin1 code points using the built-in rules.
8357     * Go to another loop to finish processing upon encountering
8358     * the first Latin1 code point.  We could do that in this loop
8359     * as well, but the other way saves having to test if the swash
8360     * has been loaded every time through the loop: extra space to
8361     * save a test. */
8362     while (hardcount < max && scan < loceol) {
8363      if (UTF8_IS_INVARIANT(*scan)) {
8364       if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
8365                 classnum))))
8366       {
8367        break;
8368       }
8369       scan++;
8370      }
8371      else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
8372       if (! (to_complement
8373        ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
8374                  *(scan + 1)),
8375              classnum))))
8376       {
8377        break;
8378       }
8379       scan += 2;
8380      }
8381      else {
8382       goto found_above_latin1;
8383      }
8384
8385      hardcount++;
8386     }
8387    }
8388    else {
8389     /* For these character classes, the knowledge of how to handle
8390     * every code point is compiled in to Perl via a macro.  This
8391     * code is written for making the loops as tight as possible.
8392     * It could be refactored to save space instead */
8393     switch (classnum) {
8394      case _CC_ENUM_SPACE:
8395       while (hardcount < max
8396        && scan < loceol
8397        && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
8398       {
8399        scan += UTF8SKIP(scan);
8400        hardcount++;
8401       }
8402       break;
8403      case _CC_ENUM_BLANK:
8404       while (hardcount < max
8405        && scan < loceol
8406        && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
8407       {
8408        scan += UTF8SKIP(scan);
8409        hardcount++;
8410       }
8411       break;
8412      case _CC_ENUM_XDIGIT:
8413       while (hardcount < max
8414        && scan < loceol
8415        && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
8416       {
8417        scan += UTF8SKIP(scan);
8418        hardcount++;
8419       }
8420       break;
8421      case _CC_ENUM_VERTSPACE:
8422       while (hardcount < max
8423        && scan < loceol
8424        && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
8425       {
8426        scan += UTF8SKIP(scan);
8427        hardcount++;
8428       }
8429       break;
8430      case _CC_ENUM_CNTRL:
8431       while (hardcount < max
8432        && scan < loceol
8433        && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
8434       {
8435        scan += UTF8SKIP(scan);
8436        hardcount++;
8437       }
8438       break;
8439      default:
8440       Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
8441     }
8442    }
8443   }
8444   break;
8445
8446  found_above_latin1:   /* Continuation of POSIXU and NPOSIXU */
8447
8448   /* Load the swash if not already present */
8449   if (! PL_utf8_swash_ptrs[classnum]) {
8450    U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
8451    PL_utf8_swash_ptrs[classnum] = _core_swash_init(
8452           "utf8",
8453           "",
8454           &PL_sv_undef, 1, 0,
8455           PL_XPosix_ptrs[classnum], &flags);
8456   }
8457
8458   while (hardcount < max && scan < loceol
8459    && to_complement ^ cBOOL(_generic_utf8(
8460          classnum,
8461          scan,
8462          swash_fetch(PL_utf8_swash_ptrs[classnum],
8463             (U8 *) scan,
8464             TRUE))))
8465   {
8466    scan += UTF8SKIP(scan);
8467    hardcount++;
8468   }
8469   break;
8470
8471  case LNBREAK:
8472   if (utf8_target) {
8473    while (hardcount < max && scan < loceol &&
8474      (c=is_LNBREAK_utf8_safe(scan, loceol))) {
8475     scan += c;
8476     hardcount++;
8477    }
8478   } else {
8479    /* LNBREAK can match one or two latin chars, which is ok, but we
8480    * have to use hardcount in this situation, and throw away the
8481    * adjustment to <loceol> done before the switch statement */
8482    loceol = reginfo->strend;
8483    while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
8484     scan+=c;
8485     hardcount++;
8486    }
8487   }
8488   break;
8489
8490  case BOUNDL:
8491  case NBOUNDL:
8492   _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8493   /* FALLTHROUGH */
8494  case BOUND:
8495  case BOUNDA:
8496  case BOUNDU:
8497  case EOS:
8498  case GPOS:
8499  case KEEPS:
8500  case NBOUND:
8501  case NBOUNDA:
8502  case NBOUNDU:
8503  case OPFAIL:
8504  case SBOL:
8505  case SEOL:
8506   /* These are all 0 width, so match right here or not at all. */
8507   break;
8508
8509  default:
8510   Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
8511   /* NOTREACHED */
8512   NOT_REACHED; /* NOTREACHED */
8513
8514  }
8515
8516  if (hardcount)
8517   c = hardcount;
8518  else
8519   c = scan - *startposp;
8520  *startposp = scan;
8521
8522  DEBUG_r({
8523   GET_RE_DEBUG_FLAGS_DECL;
8524   DEBUG_EXECUTE_r({
8525    SV * const prop = sv_newmortal();
8526    regprop(prog, prop, p, reginfo, NULL);
8527    PerlIO_printf(Perl_debug_log,
8528       "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
8529       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
8530   });
8531  });
8532
8533  return(c);
8534 }
8535
8536
8537 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
8538 /*
8539 - regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
8540 create a copy so that changes the caller makes won't change the shared one.
8541 If <altsvp> is non-null, will return NULL in it, for back-compat.
8542  */
8543 SV *
8544 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
8545 {
8546  PERL_ARGS_ASSERT_REGCLASS_SWASH;
8547
8548  if (altsvp) {
8549   *altsvp = NULL;
8550  }
8551
8552  return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
8553 }
8554
8555 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
8556
8557 /*
8558  - reginclass - determine if a character falls into a character class
8559
8560   n is the ANYOF-type regnode
8561   p is the target string
8562   p_end points to one byte beyond the end of the target string
8563   utf8_target tells whether p is in UTF-8.
8564
8565   Returns true if matched; false otherwise.
8566
8567   Note that this can be a synthetic start class, a combination of various
8568   nodes, so things you think might be mutually exclusive, such as locale,
8569   aren't.  It can match both locale and non-locale
8570
8571  */
8572
8573 STATIC bool
8574 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
8575 {
8576  dVAR;
8577  const char flags = ANYOF_FLAGS(n);
8578  bool match = FALSE;
8579  UV c = *p;
8580
8581  PERL_ARGS_ASSERT_REGINCLASS;
8582
8583  /* If c is not already the code point, get it.  Note that
8584  * UTF8_IS_INVARIANT() works even if not in UTF-8 */
8585  if (! UTF8_IS_INVARIANT(c) && utf8_target) {
8586   STRLEN c_len = 0;
8587   c = utf8n_to_uvchr(p, p_end - p, &c_len,
8588     (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
8589     | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
8590     /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
8591     * UTF8_ALLOW_FFFF */
8592   if (c_len == (STRLEN)-1)
8593    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
8594   if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
8595    _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
8596   }
8597  }
8598
8599  /* If this character is potentially in the bitmap, check it */
8600  if (c < NUM_ANYOF_CODE_POINTS) {
8601   if (ANYOF_BITMAP_TEST(n, c))
8602    match = TRUE;
8603   else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
8604     && ! utf8_target
8605     && ! isASCII(c))
8606   {
8607    match = TRUE;
8608   }
8609   else if (flags & ANYOF_LOCALE_FLAGS) {
8610    if ((flags & ANYOF_LOC_FOLD)
8611     && c < 256
8612     && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
8613    {
8614     match = TRUE;
8615    }
8616    else if (ANYOF_POSIXL_TEST_ANY_SET(n)
8617      && c < 256
8618    ) {
8619
8620     /* The data structure is arranged so bits 0, 2, 4, ... are set
8621     * if the class includes the Posix character class given by
8622     * bit/2; and 1, 3, 5, ... are set if the class includes the
8623     * complemented Posix class given by int(bit/2).  So we loop
8624     * through the bits, each time changing whether we complement
8625     * the result or not.  Suppose for the sake of illustration
8626     * that bits 0-3 mean respectively, \w, \W, \s, \S.  If bit 0
8627     * is set, it means there is a match for this ANYOF node if the
8628     * character is in the class given by the expression (0 / 2 = 0
8629     * = \w).  If it is in that class, isFOO_lc() will return 1,
8630     * and since 'to_complement' is 0, the result will stay TRUE,
8631     * and we exit the loop.  Suppose instead that bit 0 is 0, but
8632     * bit 1 is 1.  That means there is a match if the character
8633     * matches \W.  We won't bother to call isFOO_lc() on bit 0,
8634     * but will on bit 1.  On the second iteration 'to_complement'
8635     * will be 1, so the exclusive or will reverse things, so we
8636     * are testing for \W.  On the third iteration, 'to_complement'
8637     * will be 0, and we would be testing for \s; the fourth
8638     * iteration would test for \S, etc.
8639     *
8640     * Note that this code assumes that all the classes are closed
8641     * under folding.  For example, if a character matches \w, then
8642     * its fold does too; and vice versa.  This should be true for
8643     * any well-behaved locale for all the currently defined Posix
8644     * classes, except for :lower: and :upper:, which are handled
8645     * by the pseudo-class :cased: which matches if either of the
8646     * other two does.  To get rid of this assumption, an outer
8647     * loop could be used below to iterate over both the source
8648     * character, and its fold (if different) */
8649
8650     int count = 0;
8651     int to_complement = 0;
8652
8653     while (count < ANYOF_MAX) {
8654      if (ANYOF_POSIXL_TEST(n, count)
8655       && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
8656      {
8657       match = TRUE;
8658       break;
8659      }
8660      count++;
8661      to_complement ^= 1;
8662     }
8663    }
8664   }
8665  }
8666
8667
8668  /* If the bitmap didn't (or couldn't) match, and something outside the
8669  * bitmap could match, try that. */
8670  if (!match) {
8671   if (c >= NUM_ANYOF_CODE_POINTS
8672    && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
8673   {
8674    match = TRUE; /* Everything above the bitmap matches */
8675   }
8676   else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
8677     || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
8678     || ((flags & ANYOF_LOC_FOLD)
8679      && IN_UTF8_CTYPE_LOCALE
8680      && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
8681   {
8682    SV* only_utf8_locale = NULL;
8683    SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
8684              &only_utf8_locale, NULL);
8685    if (sw) {
8686     U8 utf8_buffer[2];
8687     U8 * utf8_p;
8688     if (utf8_target) {
8689      utf8_p = (U8 *) p;
8690     } else { /* Convert to utf8 */
8691      utf8_p = utf8_buffer;
8692      append_utf8_from_native_byte(*p, &utf8_p);
8693      utf8_p = utf8_buffer;
8694     }
8695
8696     if (swash_fetch(sw, utf8_p, TRUE)) {
8697      match = TRUE;
8698     }
8699    }
8700    if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
8701     match = _invlist_contains_cp(only_utf8_locale, c);
8702    }
8703   }
8704
8705   if (UNICODE_IS_SUPER(c)
8706    && (flags & ANYOF_WARN_SUPER)
8707    && ckWARN_d(WARN_NON_UNICODE))
8708   {
8709    Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
8710     "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
8711   }
8712  }
8713
8714 #if ANYOF_INVERT != 1
8715  /* Depending on compiler optimization cBOOL takes time, so if don't have to
8716  * use it, don't */
8717 #   error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
8718 #endif
8719
8720  /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
8721  return (flags & ANYOF_INVERT) ^ match;
8722 }
8723
8724 STATIC U8 *
8725 S_reghop3(U8 *s, SSize_t off, const U8* lim)
8726 {
8727  /* return the position 'off' UTF-8 characters away from 's', forward if
8728  * 'off' >= 0, backwards if negative.  But don't go outside of position
8729  * 'lim', which better be < s  if off < 0 */
8730
8731  PERL_ARGS_ASSERT_REGHOP3;
8732
8733  if (off >= 0) {
8734   while (off-- && s < lim) {
8735    /* XXX could check well-formedness here */
8736    s += UTF8SKIP(s);
8737   }
8738  }
8739  else {
8740   while (off++ && s > lim) {
8741    s--;
8742    if (UTF8_IS_CONTINUED(*s)) {
8743     while (s > lim && UTF8_IS_CONTINUATION(*s))
8744      s--;
8745    }
8746    /* XXX could check well-formedness here */
8747   }
8748  }
8749  return s;
8750 }
8751
8752 STATIC U8 *
8753 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
8754 {
8755  PERL_ARGS_ASSERT_REGHOP4;
8756
8757  if (off >= 0) {
8758   while (off-- && s < rlim) {
8759    /* XXX could check well-formedness here */
8760    s += UTF8SKIP(s);
8761   }
8762  }
8763  else {
8764   while (off++ && s > llim) {
8765    s--;
8766    if (UTF8_IS_CONTINUED(*s)) {
8767     while (s > llim && UTF8_IS_CONTINUATION(*s))
8768      s--;
8769    }
8770    /* XXX could check well-formedness here */
8771   }
8772  }
8773  return s;
8774 }
8775
8776 /* like reghop3, but returns NULL on overrun, rather than returning last
8777  * char pos */
8778
8779 STATIC U8 *
8780 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
8781 {
8782  PERL_ARGS_ASSERT_REGHOPMAYBE3;
8783
8784  if (off >= 0) {
8785   while (off-- && s < lim) {
8786    /* XXX could check well-formedness here */
8787    s += UTF8SKIP(s);
8788   }
8789   if (off >= 0)
8790    return NULL;
8791  }
8792  else {
8793   while (off++ && s > lim) {
8794    s--;
8795    if (UTF8_IS_CONTINUED(*s)) {
8796     while (s > lim && UTF8_IS_CONTINUATION(*s))
8797      s--;
8798    }
8799    /* XXX could check well-formedness here */
8800   }
8801   if (off <= 0)
8802    return NULL;
8803  }
8804  return s;
8805 }
8806
8807
8808 /* when executing a regex that may have (?{}), extra stuff needs setting
8809    up that will be visible to the called code, even before the current
8810    match has finished. In particular:
8811
8812    * $_ is localised to the SV currently being matched;
8813    * pos($_) is created if necessary, ready to be updated on each call-out
8814  to code;
8815    * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
8816  isn't set until the current pattern is successfully finished), so that
8817  $1 etc of the match-so-far can be seen;
8818    * save the old values of subbeg etc of the current regex, and  set then
8819  to the current string (again, this is normally only done at the end
8820  of execution)
8821 */
8822
8823 static void
8824 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
8825 {
8826  MAGIC *mg;
8827  regexp *const rex = ReANY(reginfo->prog);
8828  regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8829
8830  eval_state->rex = rex;
8831
8832  if (reginfo->sv) {
8833   /* Make $_ available to executed code. */
8834   if (reginfo->sv != DEFSV) {
8835    SAVE_DEFSV;
8836    DEFSV_set(reginfo->sv);
8837   }
8838
8839   if (!(mg = mg_find_mglob(reginfo->sv))) {
8840    /* prepare for quick setting of pos */
8841    mg = sv_magicext_mglob(reginfo->sv);
8842    mg->mg_len = -1;
8843   }
8844   eval_state->pos_magic = mg;
8845   eval_state->pos       = mg->mg_len;
8846   eval_state->pos_flags = mg->mg_flags;
8847  }
8848  else
8849   eval_state->pos_magic = NULL;
8850
8851  if (!PL_reg_curpm) {
8852   /* PL_reg_curpm is a fake PMOP that we can attach the current
8853   * regex to and point PL_curpm at, so that $1 et al are visible
8854   * within a /(?{})/. It's just allocated once per interpreter the
8855   * first time its needed */
8856   Newxz(PL_reg_curpm, 1, PMOP);
8857 #ifdef USE_ITHREADS
8858   {
8859    SV* const repointer = &PL_sv_undef;
8860    /* this regexp is also owned by the new PL_reg_curpm, which
8861    will try to free it.  */
8862    av_push(PL_regex_padav, repointer);
8863    PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
8864    PL_regex_pad = AvARRAY(PL_regex_padav);
8865   }
8866 #endif
8867  }
8868  SET_reg_curpm(reginfo->prog);
8869  eval_state->curpm = PL_curpm;
8870  PL_curpm = PL_reg_curpm;
8871  if (RXp_MATCH_COPIED(rex)) {
8872   /*  Here is a serious problem: we cannot rewrite subbeg,
8873    since it may be needed if this match fails.  Thus
8874    $` inside (?{}) could fail... */
8875   eval_state->subbeg     = rex->subbeg;
8876   eval_state->sublen     = rex->sublen;
8877   eval_state->suboffset  = rex->suboffset;
8878   eval_state->subcoffset = rex->subcoffset;
8879 #ifdef PERL_ANY_COW
8880   eval_state->saved_copy = rex->saved_copy;
8881 #endif
8882   RXp_MATCH_COPIED_off(rex);
8883  }
8884  else
8885   eval_state->subbeg = NULL;
8886  rex->subbeg = (char *)reginfo->strbeg;
8887  rex->suboffset = 0;
8888  rex->subcoffset = 0;
8889  rex->sublen = reginfo->strend - reginfo->strbeg;
8890 }
8891
8892
8893 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
8894
8895 static void
8896 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
8897 {
8898  regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8899  regmatch_info_aux_eval *eval_state =  aux->info_aux_eval;
8900  regmatch_slab *s;
8901
8902  Safefree(aux->poscache);
8903
8904  if (eval_state) {
8905
8906   /* undo the effects of S_setup_eval_state() */
8907
8908   if (eval_state->subbeg) {
8909    regexp * const rex = eval_state->rex;
8910    rex->subbeg     = eval_state->subbeg;
8911    rex->sublen     = eval_state->sublen;
8912    rex->suboffset  = eval_state->suboffset;
8913    rex->subcoffset = eval_state->subcoffset;
8914 #ifdef PERL_ANY_COW
8915    rex->saved_copy = eval_state->saved_copy;
8916 #endif
8917    RXp_MATCH_COPIED_on(rex);
8918   }
8919   if (eval_state->pos_magic)
8920   {
8921    eval_state->pos_magic->mg_len = eval_state->pos;
8922    eval_state->pos_magic->mg_flags =
8923     (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8924    | (eval_state->pos_flags & MGf_BYTES);
8925   }
8926
8927   PL_curpm = eval_state->curpm;
8928  }
8929
8930  PL_regmatch_state = aux->old_regmatch_state;
8931  PL_regmatch_slab  = aux->old_regmatch_slab;
8932
8933  /* free all slabs above current one - this must be the last action
8934  * of this function, as aux and eval_state are allocated within
8935  * slabs and may be freed here */
8936
8937  s = PL_regmatch_slab->next;
8938  if (s) {
8939   PL_regmatch_slab->next = NULL;
8940   while (s) {
8941    regmatch_slab * const osl = s;
8942    s = s->next;
8943    Safefree(osl);
8944   }
8945  }
8946 }
8947
8948
8949 STATIC void
8950 S_to_utf8_substr(pTHX_ regexp *prog)
8951 {
8952  /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8953  * on the converted value */
8954
8955  int i = 1;
8956
8957  PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8958
8959  do {
8960   if (prog->substrs->data[i].substr
8961    && !prog->substrs->data[i].utf8_substr) {
8962    SV* const sv = newSVsv(prog->substrs->data[i].substr);
8963    prog->substrs->data[i].utf8_substr = sv;
8964    sv_utf8_upgrade(sv);
8965    if (SvVALID(prog->substrs->data[i].substr)) {
8966     if (SvTAIL(prog->substrs->data[i].substr)) {
8967      /* Trim the trailing \n that fbm_compile added last
8968      time.  */
8969      SvCUR_set(sv, SvCUR(sv) - 1);
8970      /* Whilst this makes the SV technically "invalid" (as its
8971      buffer is no longer followed by "\0") when fbm_compile()
8972      adds the "\n" back, a "\0" is restored.  */
8973      fbm_compile(sv, FBMcf_TAIL);
8974     } else
8975      fbm_compile(sv, 0);
8976    }
8977    if (prog->substrs->data[i].substr == prog->check_substr)
8978     prog->check_utf8 = sv;
8979   }
8980  } while (i--);
8981 }
8982
8983 STATIC bool
8984 S_to_byte_substr(pTHX_ regexp *prog)
8985 {
8986  /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8987  * on the converted value; returns FALSE if can't be converted. */
8988
8989  int i = 1;
8990
8991  PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8992
8993  do {
8994   if (prog->substrs->data[i].utf8_substr
8995    && !prog->substrs->data[i].substr) {
8996    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
8997    if (! sv_utf8_downgrade(sv, TRUE)) {
8998     return FALSE;
8999    }
9000    if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9001     if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9002      /* Trim the trailing \n that fbm_compile added last
9003       time.  */
9004      SvCUR_set(sv, SvCUR(sv) - 1);
9005      fbm_compile(sv, FBMcf_TAIL);
9006     } else
9007      fbm_compile(sv, 0);
9008    }
9009    prog->substrs->data[i].substr = sv;
9010    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9011     prog->check_substr = sv;
9012   }
9013  } while (i--);
9014
9015  return TRUE;
9016 }
9017
9018 /*
9019  * ex: set ts=8 sts=4 sw=4 et:
9020  */