src/5012000/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *  One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  * Copyright (c) 1986 by University of Toronto.
  44  * Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  * Permission is granted to anyone to use this software for any
  47  * purpose on any computer system, and to redistribute it freely,
  48  * subject to the following restrictions:
  49  *
  50  * 1. The author is not responsible for the consequences of use of
  51  *  this software, no matter how awful, even if they arise
  52  *  from defects in it.
  53  *
  54  * 2. The origin of this software must not be misrepresented, either
  55  *  by explicit claim or by omission.
  56  *
  57  * 3. Altered versions must be plainly marked as such, and must not
  58  *  be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #undef PERL_IN_XSUB_RE
  76 #define PERL_IN_XSUB_RE 1
  77 #include "perl.h"
  78 #include "re_defs.h"
  79 #undef PERL_IN_XSUB_RE
  80
  81 #ifdef PERL_IN_XSUB_RE
  82 #  include "re_comp.h"
  83 #else
  84 #  include "regcomp.h"
  85 #endif
  86
  87 #define RF_tainted 1  /* tainted information used? */
  88 #define RF_warned 2  /* warned about big count? */
  89
  90 #define RF_utf8  8  /* Pattern contains multibyte chars? */
  91
  92 #define UTF ((PL_reg_flags & RF_utf8) != 0)
  93
  94 #define RS_init  1  /* eval environment created */
  95 #define RS_set  2  /* replsv value is set */
  96
  97 #ifndef STATIC
  98 #define STATIC static
  99 #endif
 100
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111   (char *)(PL_reg_match_utf8 \
 112    ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113    : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115   (char*)(PL_reg_match_utf8\
 116    ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117    : (pos - off >= PL_bostr)  \
 118     ? (U8*)pos - off  \
 119     : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126  if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that is correct */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130  if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137   LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138   LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139   /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140    * assert should likely and hopefully fail on an EBCDIC machine */ \
 141   LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                    \
 143   /* No asserts are done for these, in case called on an early        \
 144    * Unicode version in which they map to nothing */               \
 145   LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146   LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);     /* U+1100 "\xe1\x84\x80" */ \
 147   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150   LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151   LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 /*
 154    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 155    so that it is possible to override the option here without having to
 156    rebuild the entire core. as we are required to do if we change regcomp.h
 157    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 158 */
 159 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 160 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 161 #endif
 162
 163 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 164 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 165 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 166 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 167 #define RE_utf8_perl_word   PL_utf8_alnum
 168 #define RE_utf8_perl_space  PL_utf8_space
 169 #define RE_utf8_posix_digit PL_utf8_digit
 170 #define perl_word  alnum
 171 #define perl_space space
 172 #define posix_digit digit
 173 #else
 174 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 175 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 176 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 177 #define RE_utf8_perl_word   PL_utf8_perl_word
 178 #define RE_utf8_perl_space  PL_utf8_perl_space
 179 #define RE_utf8_posix_digit PL_utf8_posix_digit
 180 #endif
 181
 182
 183 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
 184   case NAMEL:                                                              \
 185    PL_reg_flags |= RF_tainted;                                                 \
 186    /* FALL THROUGH */                                                          \
 187   case NAME:                                                                     \
 188    if (!nextchr)                                                               \
 189     sayNO;                                                                  \
 190    if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 191     if (!CAT2(PL_utf8_,CLASS)) {                                            \
 192      bool ok;                                                            \
 193      ENTER;                                                              \
 194      save_re_context();                                                  \
 195      ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 196      assert(ok);                                                         \
 197      LEAVE;                                                              \
 198     }                                                                       \
 199     if (!(OP(scan) == NAME                                                  \
 200      ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)   \
 201      : LCFUNC_utf8((U8*)locinput)))                                      \
 202     {                                                                       \
 203      sayNO;                                                              \
 204     }                                                                       \
 205     locinput += PL_utf8skip[nextchr];                                       \
 206     nextchr = UCHARAT(locinput);                                            \
 207     break;                                                                  \
 208    }                                                                           \
 209    if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
 210     sayNO;                                                                  \
 211    nextchr = UCHARAT(++locinput);                                              \
 212    break
 213
 214 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
 215   case NAMEL:                                                              \
 216    PL_reg_flags |= RF_tainted;                                                 \
 217    /* FALL THROUGH */                                                          \
 218   case NAME :                                                                     \
 219    if (!nextchr && locinput >= PL_regeol)                                      \
 220     sayNO;                                                                  \
 221    if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 222     if (!CAT2(PL_utf8_,CLASS)) {                                            \
 223      bool ok;                                                            \
 224      ENTER;                                                              \
 225      save_re_context();                                                  \
 226      ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 227      assert(ok);                                                         \
 228      LEAVE;                                                              \
 229     }                                                                       \
 230     if ((OP(scan) == NAME                                                  \
 231      ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)    \
 232      : LCFUNC_utf8((U8*)locinput)))                                      \
 233     {                                                                       \
 234      sayNO;                                                              \
 235     }                                                                       \
 236     locinput += PL_utf8skip[nextchr];                                       \
 237     nextchr = UCHARAT(locinput);                                            \
 238     break;                                                                  \
 239    }                                                                           \
 240    if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
 241     sayNO;                                                                  \
 242    nextchr = UCHARAT(++locinput);                                              \
 243    break
 244
 245
 246
 247
 248
 249 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 250
 251 /* for use after a quantifier and before an EXACT-like node -- japhy */
 252 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
 253 #define JUMPABLE(rn) (      \
 254  OP(rn) == OPEN ||       \
 255  (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 256  OP(rn) == EVAL ||   \
 257  OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 258  OP(rn) == PLUS || OP(rn) == MINMOD || \
 259  OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
 260  (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 261 )
 262 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 263
 264 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 265
 266 #if 0
 267 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 268    we don't need this definition. */
 269 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 270 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 271 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 272
 273 #else
 274 /* ... so we use this as its faster. */
 275 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 276 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 277 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 278
 279 #endif
 280
 281 /*
 282   Search for mandatory following text node; for lookahead, the text must
 283   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 284 */
 285 #define FIND_NEXT_IMPT(rn) STMT_START { \
 286  while (JUMPABLE(rn)) { \
 287   const OPCODE type = OP(rn); \
 288   if (type == SUSPEND || PL_regkind[type] == CURLY) \
 289    rn = NEXTOPER(NEXTOPER(rn)); \
 290   else if (type == PLUS) \
 291    rn = NEXTOPER(rn); \
 292   else if (type == IFMATCH) \
 293    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 294   else rn += NEXT_OFF(rn); \
 295  } \
 296 } STMT_END
 297
 298
 299 static void restore_pos(pTHX_ void *arg);
 300
 301 STATIC CHECKPOINT
 302 S_regcppush(pTHX_ I32 parenfloor)
 303 {
 304  dVAR;
 305  const int retval = PL_savestack_ix;
 306 #define REGCP_PAREN_ELEMS 4
 307  const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 308  int p;
 309  GET_RE_DEBUG_FLAGS_DECL;
 310
 311  if (paren_elems_to_push < 0)
 312   Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 313
 314 #define REGCP_OTHER_ELEMS 7
 315  SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
 316
 317  for (p = PL_regsize; p > parenfloor; p--) {
 318 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 319   SSPUSHINT(PL_regoffs[p].end);
 320   SSPUSHINT(PL_regoffs[p].start);
 321   SSPUSHPTR(PL_reg_start_tmp[p]);
 322   SSPUSHINT(p);
 323   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 324   "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 325      (UV)p, (IV)PL_regoffs[p].start,
 326      (IV)(PL_reg_start_tmp[p] - PL_bostr),
 327      (IV)PL_regoffs[p].end
 328   ));
 329  }
 330 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 331  SSPUSHPTR(PL_regoffs);
 332  SSPUSHINT(PL_regsize);
 333  SSPUSHINT(*PL_reglastparen);
 334  SSPUSHINT(*PL_reglastcloseparen);
 335  SSPUSHPTR(PL_reginput);
 336 #define REGCP_FRAME_ELEMS 2
 337 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
 338  * are needed for the regexp context stack bookkeeping. */
 339  SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
 340  SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
 341
 342  return retval;
 343 }
 344
 345 /* These are needed since we do not localize EVAL nodes: */
 346 #define REGCP_SET(cp)                                           \
 347  DEBUG_STATE_r(                                              \
 348    PerlIO_printf(Perl_debug_log,          \
 349     "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 350     (IV)PL_savestack_ix));                          \
 351  cp = PL_savestack_ix
 352
 353 #define REGCP_UNWIND(cp)                                        \
 354  DEBUG_STATE_r(                                              \
 355   if (cp != PL_savestack_ix)                   \
 356     PerlIO_printf(Perl_debug_log,          \
 357     "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 358     (IV)(cp), (IV)PL_savestack_ix));                \
 359  regcpblow(cp)
 360
 361 STATIC char *
 362 S_regcppop(pTHX_ const regexp *rex)
 363 {
 364  dVAR;
 365  U32 i;
 366  char *input;
 367  GET_RE_DEBUG_FLAGS_DECL;
 368
 369  PERL_ARGS_ASSERT_REGCPPOP;
 370
 371  /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 372  i = SSPOPINT;
 373  assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 374  i = SSPOPINT; /* Parentheses elements to pop. */
 375  input = (char *) SSPOPPTR;
 376  *PL_reglastcloseparen = SSPOPINT;
 377  *PL_reglastparen = SSPOPINT;
 378  PL_regsize = SSPOPINT;
 379  PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 380
 381
 382  /* Now restore the parentheses context. */
 383  for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
 384   i > 0; i -= REGCP_PAREN_ELEMS) {
 385   I32 tmps;
 386   U32 paren = (U32)SSPOPINT;
 387   PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 388   PL_regoffs[paren].start = SSPOPINT;
 389   tmps = SSPOPINT;
 390   if (paren <= *PL_reglastparen)
 391    PL_regoffs[paren].end = tmps;
 392   DEBUG_BUFFERS_r(
 393    PerlIO_printf(Perl_debug_log,
 394       "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 395       (UV)paren, (IV)PL_regoffs[paren].start,
 396       (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 397       (IV)PL_regoffs[paren].end,
 398       (paren > *PL_reglastparen ? "(no)" : ""));
 399   );
 400  }
 401  DEBUG_BUFFERS_r(
 402   if (*PL_reglastparen + 1 <= rex->nparens) {
 403    PerlIO_printf(Perl_debug_log,
 404       "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 405       (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 406   }
 407  );
 408 #if 1
 409  /* It would seem that the similar code in regtry()
 410  * already takes care of this, and in fact it is in
 411  * a better location to since this code can #if 0-ed out
 412  * but the code in regtry() is needed or otherwise tests
 413  * requiring null fields (pat.t#187 and split.t#{13,14}
 414  * (as of patchlevel 7877)  will fail.  Then again,
 415  * this code seems to be necessary or otherwise
 416  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 417  * --jhi updated by dapm */
 418  for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 419   if (i > PL_regsize)
 420    PL_regoffs[i].start = -1;
 421   PL_regoffs[i].end = -1;
 422  }
 423 #endif
 424  return input;
 425 }
 426
 427 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
 428
 429 /*
 430  * pregexec and friends
 431  */
 432
 433 #ifndef PERL_IN_XSUB_RE
 434 /*
 435  - pregexec - match a regexp against a string
 436  */
 437 I32
 438 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 439   char *strbeg, I32 minend, SV *screamer, U32 nosave)
 440 /* strend: pointer to null at end of string */
 441 /* strbeg: real beginning of string */
 442 /* minend: end of match must be >=minend after stringarg. */
 443 /* nosave: For optimizations. */
 444 {
 445  PERL_ARGS_ASSERT_PREGEXEC;
 446
 447  return
 448   regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 449      nosave ? 0 : REXEC_COPY_STR);
 450 }
 451 #endif
 452
 453 /*
 454  * Need to implement the following flags for reg_anch:
 455  *
 456  * USE_INTUIT_NOML  - Useful to call re_intuit_start() first
 457  * USE_INTUIT_ML
 458  * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
 459  * INTUIT_AUTORITATIVE_ML
 460  * INTUIT_ONCE_NOML  - Intuit can match in one location only.
 461  * INTUIT_ONCE_ML
 462  *
 463  * Another flag for this function: SECOND_TIME (so that float substrs
 464  * with giant delta may be not rechecked).
 465  */
 466
 467 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 468
 469 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 470    Otherwise, only SvCUR(sv) is used to get strbeg. */
 471
 472 /* XXXX We assume that strpos is strbeg unless sv. */
 473
 474 /* XXXX Some places assume that there is a fixed substring.
 475   An update may be needed if optimizer marks as "INTUITable"
 476   RExen without fixed substrings.  Similarly, it is assumed that
 477   lengths of all the strings are no more than minlen, thus they
 478   cannot come from lookahead.
 479   (Or minlen should take into account lookahead.)
 480   NOTE: Some of this comment is not correct. minlen does now take account
 481   of lookahead/behind. Further research is required. -- demerphq
 482
 483 */
 484
 485 /* A failure to find a constant substring means that there is no need to make
 486    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 487    finding a substring too deep into the string means that less calls to
 488    regtry() should be needed.
 489
 490    REx compiler's optimizer found 4 possible hints:
 491   a) Anchored substring;
 492   b) Fixed substring;
 493   c) Whether we are anchored (beginning-of-line or \G);
 494   d) First node (of those at offset 0) which may distingush positions;
 495    We use a)b)d) and multiline-part of c), and try to find a position in the
 496    string which does not contradict any of them.
 497  */
 498
 499 /* Most of decisions we do here should have been done at compile time.
 500    The nodes of the REx which we used for the search should have been
 501    deleted from the finite automaton. */
 502
 503 char *
 504 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 505      char *strend, const U32 flags, re_scream_pos_data *data)
 506 {
 507  dVAR;
 508  struct regexp *const prog = (struct regexp *)SvANY(rx);
 509  register I32 start_shift = 0;
 510  /* Should be nonnegative! */
 511  register I32 end_shift   = 0;
 512  register char *s;
 513  register SV *check;
 514  char *strbeg;
 515  char *t;
 516  const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 517  I32 ml_anch;
 518  register char *other_last = NULL; /* other substr checked before this */
 519  char *check_at = NULL;  /* check substr found at this pos */
 520  const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 521  RXi_GET_DECL(prog,progi);
 522 #ifdef DEBUGGING
 523  const char * const i_strpos = strpos;
 524 #endif
 525  GET_RE_DEBUG_FLAGS_DECL;
 526
 527  PERL_ARGS_ASSERT_RE_INTUIT_START;
 528
 529  RX_MATCH_UTF8_set(rx,do_utf8);
 530
 531  if (RX_UTF8(rx)) {
 532   PL_reg_flags |= RF_utf8;
 533  }
 534  DEBUG_EXECUTE_r(
 535   debug_start_match(rx, do_utf8, strpos, strend,
 536    sv ? "Guessing start of match in sv for"
 537    : "Guessing start of match in string for");
 538    );
 539
 540  /* CHR_DIST() would be more correct here but it makes things slow. */
 541  if (prog->minlen > strend - strpos) {
 542   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 543        "String too short... [re_intuit_start]\n"));
 544   goto fail;
 545  }
 546
 547  strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 548  PL_regeol = strend;
 549  if (do_utf8) {
 550   if (!prog->check_utf8 && prog->check_substr)
 551    to_utf8_substr(prog);
 552   check = prog->check_utf8;
 553  } else {
 554   if (!prog->check_substr && prog->check_utf8)
 555    to_byte_substr(prog);
 556   check = prog->check_substr;
 557  }
 558  if (check == &PL_sv_undef) {
 559   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 560     "Non-utf8 string cannot match utf8 check string\n"));
 561   goto fail;
 562  }
 563  if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
 564   ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 565      || ( (prog->extflags & RXf_ANCH_BOL)
 566       && !multiline ) ); /* Check after \n? */
 567
 568   if (!ml_anch) {
 569   if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 570     && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 571    /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 572    && sv && !SvROK(sv)
 573    && (strpos != strbeg)) {
 574    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 575    goto fail;
 576   }
 577   if (prog->check_offset_min == prog->check_offset_max &&
 578    !(prog->extflags & RXf_CANY_SEEN)) {
 579    /* Substring at constant offset from beg-of-str... */
 580    I32 slen;
 581
 582    s = HOP3c(strpos, prog->check_offset_min, strend);
 583
 584    if (SvTAIL(check)) {
 585     slen = SvCUR(check); /* >= 1 */
 586
 587     if ( strend - s > slen || strend - s < slen - 1
 588      || (strend - s == slen && strend[-1] != '\n')) {
 589      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 590      goto fail_finish;
 591     }
 592     /* Now should match s[0..slen-2] */
 593     slen--;
 594     if (slen && (*SvPVX_const(check) != *s
 595        || (slen > 1
 596         && memNE(SvPVX_const(check), s, slen)))) {
 597     report_neq:
 598      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 599      goto fail_finish;
 600     }
 601    }
 602    else if (*SvPVX_const(check) != *s
 603      || ((slen = SvCUR(check)) > 1
 604       && memNE(SvPVX_const(check), s, slen)))
 605     goto report_neq;
 606    check_at = s;
 607    goto success_at_start;
 608   }
 609   }
 610   /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 611   s = strpos;
 612   start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 613   end_shift = prog->check_end_shift;
 614
 615   if (!ml_anch) {
 616    const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 617           - (SvTAIL(check) != 0);
 618    const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 619
 620    if (end_shift < eshift)
 621     end_shift = eshift;
 622   }
 623  }
 624  else {    /* Can match at random position */
 625   ml_anch = 0;
 626   s = strpos;
 627   start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 628   end_shift = prog->check_end_shift;
 629
 630   /* end shift should be non negative here */
 631  }
 632
 633 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
 634  if (end_shift < 0)
 635   Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 636     (IV)end_shift, RX_PRECOMP(prog));
 637 #endif
 638
 639   restart:
 640  /* Find a possible match in the region s..strend by looking for
 641  the "check" substring in the region corrected by start/end_shift. */
 642
 643  {
 644   I32 srch_start_shift = start_shift;
 645   I32 srch_end_shift = end_shift;
 646   if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 647    srch_end_shift -= ((strbeg - s) - srch_start_shift);
 648    srch_start_shift = strbeg - s;
 649   }
 650  DEBUG_OPTIMISE_MORE_r({
 651   PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 652    (IV)prog->check_offset_min,
 653    (IV)srch_start_shift,
 654    (IV)srch_end_shift,
 655    (IV)prog->check_end_shift);
 656  });
 657
 658  if (flags & REXEC_SCREAM) {
 659   I32 p = -1;   /* Internal iterator of scream. */
 660   I32 * const pp = data ? data->scream_pos : &p;
 661
 662   if (PL_screamfirst[BmRARE(check)] >= 0
 663    || ( BmRARE(check) == '\n'
 664     && (BmPREVIOUS(check) == SvCUR(check) - 1)
 665     && SvTAIL(check) ))
 666    s = screaminstr(sv, check,
 667        srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 668   else
 669    goto fail_finish;
 670   /* we may be pointing at the wrong string */
 671   if (s && RXp_MATCH_COPIED(prog))
 672    s = strbeg + (s - SvPVX_const(sv));
 673   if (data)
 674    *data->scream_olds = s;
 675  }
 676  else {
 677   U8* start_point;
 678   U8* end_point;
 679   if (prog->extflags & RXf_CANY_SEEN) {
 680    start_point= (U8*)(s + srch_start_shift);
 681    end_point= (U8*)(strend - srch_end_shift);
 682   } else {
 683    start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 684    end_point= HOP3(strend, -srch_end_shift, strbeg);
 685   }
 686   DEBUG_OPTIMISE_MORE_r({
 687    PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 688     (int)(end_point - start_point),
 689     (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 690     start_point);
 691   });
 692
 693   s = fbm_instr( start_point, end_point,
 694      check, multiline ? FBMrf_MULTILINE : 0);
 695  }
 696  }
 697  /* Update the count-of-usability, remove useless subpatterns,
 698   unshift s.  */
 699
 700  DEBUG_EXECUTE_r({
 701   RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 702    SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 703   PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 704       (s ? "Found" : "Did not find"),
 705    (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
 706     ? "anchored" : "floating"),
 707    quoted,
 708    RE_SV_TAIL(check),
 709    (s ? " at offset " : "...\n") );
 710  });
 711
 712  if (!s)
 713   goto fail_finish;
 714  /* Finish the diagnostic message */
 715  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 716
 717  /* XXX dmq: first branch is for positive lookbehind...
 718  Our check string is offset from the beginning of the pattern.
 719  So we need to do any stclass tests offset forward from that
 720  point. I think. :-(
 721  */
 722
 723
 724
 725  check_at=s;
 726
 727
 728  /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 729  Start with the other substr.
 730  XXXX no SCREAM optimization yet - and a very coarse implementation
 731  XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 732     *always* match.  Probably should be marked during compile...
 733  Probably it is right to do no SCREAM here...
 734  */
 735
 736  if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
 737     : (prog->float_substr && prog->anchored_substr))
 738  {
 739   /* Take into account the "other" substring. */
 740   /* XXXX May be hopelessly wrong for UTF... */
 741   if (!other_last)
 742    other_last = strpos;
 743   if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
 744   do_other_anchored:
 745    {
 746     char * const last = HOP3c(s, -start_shift, strbeg);
 747     char *last1, *last2;
 748     char * const saved_s = s;
 749     SV* must;
 750
 751     t = s - prog->check_offset_max;
 752     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 753      && (!do_utf8
 754       || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 755        && t > strpos)))
 756      NOOP;
 757     else
 758      t = strpos;
 759     t = HOP3c(t, prog->anchored_offset, strend);
 760     if (t < other_last) /* These positions already checked */
 761      t = other_last;
 762     last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 763     if (last < last1)
 764      last1 = last;
 765     /* XXXX It is not documented what units *_offsets are in.
 766     We assume bytes, but this is clearly wrong.
 767     Meaning this code needs to be carefully reviewed for errors.
 768     dmq.
 769     */
 770
 771     /* On end-of-str: see comment below. */
 772     must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
 773     if (must == &PL_sv_undef) {
 774      s = (char*)NULL;
 775      DEBUG_r(must = prog->anchored_utf8); /* for debug */
 776     }
 777     else
 778      s = fbm_instr(
 779       (unsigned char*)t,
 780       HOP3(HOP3(last1, prog->anchored_offset, strend)
 781         + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 782       must,
 783       multiline ? FBMrf_MULTILINE : 0
 784      );
 785     DEBUG_EXECUTE_r({
 786      RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 787       SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 788      PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 789       (s ? "Found" : "Contradicts"),
 790       quoted, RE_SV_TAIL(must));
 791     });
 792
 793
 794     if (!s) {
 795      if (last1 >= last2) {
 796       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 797             ", giving up...\n"));
 798       goto fail_finish;
 799      }
 800      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 801       ", trying floating at offset %ld...\n",
 802       (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 803      other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 804      s = HOP3c(last, 1, strend);
 805      goto restart;
 806     }
 807     else {
 808      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 809       (long)(s - i_strpos)));
 810      t = HOP3c(s, -prog->anchored_offset, strbeg);
 811      other_last = HOP3c(s, 1, strend);
 812      s = saved_s;
 813      if (t == strpos)
 814       goto try_at_start;
 815      goto try_at_offset;
 816     }
 817    }
 818   }
 819   else {  /* Take into account the floating substring. */
 820    char *last, *last1;
 821    char * const saved_s = s;
 822    SV* must;
 823
 824    t = HOP3c(s, -start_shift, strbeg);
 825    last1 = last =
 826     HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 827    if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 828     last = HOP3c(t, prog->float_max_offset, strend);
 829    s = HOP3c(t, prog->float_min_offset, strend);
 830    if (s < other_last)
 831     s = other_last;
 832  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 833    must = do_utf8 ? prog->float_utf8 : prog->float_substr;
 834    /* fbm_instr() takes into account exact value of end-of-str
 835    if the check is SvTAIL(ed).  Since false positives are OK,
 836    and end-of-str is not later than strend we are OK. */
 837    if (must == &PL_sv_undef) {
 838     s = (char*)NULL;
 839     DEBUG_r(must = prog->float_utf8); /* for debug message */
 840    }
 841    else
 842     s = fbm_instr((unsigned char*)s,
 843        (unsigned char*)last + SvCUR(must)
 844         - (SvTAIL(must)!=0),
 845        must, multiline ? FBMrf_MULTILINE : 0);
 846    DEBUG_EXECUTE_r({
 847     RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 848      SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 849     PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 850      (s ? "Found" : "Contradicts"),
 851      quoted, RE_SV_TAIL(must));
 852    });
 853    if (!s) {
 854     if (last1 == last) {
 855      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 856            ", giving up...\n"));
 857      goto fail_finish;
 858     }
 859     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 860      ", trying anchored starting at offset %ld...\n",
 861      (long)(saved_s + 1 - i_strpos)));
 862     other_last = last;
 863     s = HOP3c(t, 1, strend);
 864     goto restart;
 865    }
 866    else {
 867     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 868      (long)(s - i_strpos)));
 869     other_last = s; /* Fix this later. --Hugo */
 870     s = saved_s;
 871     if (t == strpos)
 872      goto try_at_start;
 873     goto try_at_offset;
 874    }
 875   }
 876  }
 877
 878
 879  t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 880
 881  DEBUG_OPTIMISE_MORE_r(
 882   PerlIO_printf(Perl_debug_log,
 883    "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 884    (IV)prog->check_offset_min,
 885    (IV)prog->check_offset_max,
 886    (IV)(s-strpos),
 887    (IV)(t-strpos),
 888    (IV)(t-s),
 889    (IV)(strend-strpos)
 890   )
 891  );
 892
 893  if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 894   && (!do_utf8
 895    || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 896     && t > strpos)))
 897  {
 898   /* Fixed substring is found far enough so that the match
 899   cannot start at strpos. */
 900  try_at_offset:
 901   if (ml_anch && t[-1] != '\n') {
 902    /* Eventually fbm_*() should handle this, but often
 903    anchored_offset is not 0, so this check will not be wasted. */
 904    /* XXXX In the code below we prefer to look for "^" even in
 905    presence of anchored substrings.  And we search even
 906    beyond the found float position.  These pessimizations
 907    are historical artefacts only.  */
 908   find_anchor:
 909    while (t < strend - prog->minlen) {
 910     if (*t == '\n') {
 911      if (t < check_at - prog->check_offset_min) {
 912       if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
 913        /* Since we moved from the found position,
 914        we definitely contradict the found anchored
 915        substr.  Due to the above check we do not
 916        contradict "check" substr.
 917        Thus we can arrive here only if check substr
 918        is float.  Redo checking for "other"=="fixed".
 919        */
 920        strpos = t + 1;
 921        DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 922         PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 923        goto do_other_anchored;
 924       }
 925       /* We don't contradict the found floating substring. */
 926       /* XXXX Why not check for STCLASS? */
 927       s = t + 1;
 928       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 929        PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 930       goto set_useful;
 931      }
 932      /* Position contradicts check-string */
 933      /* XXXX probably better to look for check-string
 934      than for "\n", so one should lower the limit for t? */
 935      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 936       PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 937      other_last = strpos = s = t + 1;
 938      goto restart;
 939     }
 940     t++;
 941    }
 942    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 943       PL_colors[0], PL_colors[1]));
 944    goto fail_finish;
 945   }
 946   else {
 947    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 948       PL_colors[0], PL_colors[1]));
 949   }
 950   s = t;
 951  set_useful:
 952   ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
 953  }
 954  else {
 955   /* The found string does not prohibit matching at strpos,
 956   - no optimization of calling REx engine can be performed,
 957   unless it was an MBOL and we are not after MBOL,
 958   or a future STCLASS check will fail this. */
 959  try_at_start:
 960   /* Even in this situation we may use MBOL flag if strpos is offset
 961   wrt the start of the string. */
 962   if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 963    && (strpos != strbeg) && strpos[-1] != '\n'
 964    /* May be due to an implicit anchor of m{.*foo}  */
 965    && !(prog->intflags & PREGf_IMPLICIT))
 966   {
 967    t = strpos;
 968    goto find_anchor;
 969   }
 970   DEBUG_EXECUTE_r( if (ml_anch)
 971    PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
 972       (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
 973   );
 974  success_at_start:
 975   if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
 976    && (do_utf8 ? (
 977     prog->check_utf8  /* Could be deleted already */
 978     && --BmUSEFUL(prog->check_utf8) < 0
 979     && (prog->check_utf8 == prog->float_utf8)
 980    ) : (
 981     prog->check_substr  /* Could be deleted already */
 982     && --BmUSEFUL(prog->check_substr) < 0
 983     && (prog->check_substr == prog->float_substr)
 984    )))
 985   {
 986    /* If flags & SOMETHING - do not do it many times on the same match */
 987    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
 988    /* XXX Does the destruction order has to change with do_utf8? */
 989    SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
 990    SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
 991    prog->check_substr = prog->check_utf8 = NULL; /* disable */
 992    prog->float_substr = prog->float_utf8 = NULL; /* clear */
 993    check = NULL;   /* abort */
 994    s = strpos;
 995    /* XXXX This is a remnant of the old implementation.  It
 996      looks wasteful, since now INTUIT can use many
 997      other heuristics. */
 998    prog->extflags &= ~RXf_USE_INTUIT;
 999   }
1000   else
1001    s = strpos;
1002  }
1003
1004  /* Last resort... */
1005  /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1006  /* trie stclasses are too expensive to use here, we are better off to
1007  leave it to regmatch itself */
1008  if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1009   /* minlen == 0 is possible if regstclass is \b or \B,
1010   and the fixed substr is ''$.
1011   Since minlen is already taken into account, s+1 is before strend;
1012   accidentally, minlen >= 1 guaranties no false positives at s + 1
1013   even for \b or \B.  But (minlen? 1 : 0) below assumes that
1014   regstclass does not come from lookahead...  */
1015   /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1016   This leaves EXACTF only, which is dealt with in find_byclass().  */
1017   const U8* const str = (U8*)STRING(progi->regstclass);
1018   const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1019      ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1020      : 1);
1021   char * endpos;
1022   if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1023    endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1024   else if (prog->float_substr || prog->float_utf8)
1025    endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1026   else
1027    endpos= strend;
1028
1029   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1030          (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1031
1032   t = s;
1033   s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1034   if (!s) {
1035 #ifdef DEBUGGING
1036    const char *what = NULL;
1037 #endif
1038    if (endpos == strend) {
1039     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1040         "Could not match STCLASS...\n") );
1041     goto fail;
1042    }
1043    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1044         "This position contradicts STCLASS...\n") );
1045    if ((prog->extflags & RXf_ANCH) && !ml_anch)
1046     goto fail;
1047    /* Contradict one of substrings */
1048    if (prog->anchored_substr || prog->anchored_utf8) {
1049     if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1050      DEBUG_EXECUTE_r( what = "anchored" );
1051     hop_and_restart:
1052      s = HOP3c(t, 1, strend);
1053      if (s + start_shift + end_shift > strend) {
1054       /* XXXX Should be taken into account earlier? */
1055       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1056            "Could not match STCLASS...\n") );
1057       goto fail;
1058      }
1059      if (!check)
1060       goto giveup;
1061      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1062         "Looking for %s substr starting at offset %ld...\n",
1063         what, (long)(s + start_shift - i_strpos)) );
1064      goto restart;
1065     }
1066     /* Have both, check_string is floating */
1067     if (t + start_shift >= check_at) /* Contradicts floating=check */
1068      goto retry_floating_check;
1069     /* Recheck anchored substring, but not floating... */
1070     s = check_at;
1071     if (!check)
1072      goto giveup;
1073     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1074       "Looking for anchored substr starting at offset %ld...\n",
1075       (long)(other_last - i_strpos)) );
1076     goto do_other_anchored;
1077    }
1078    /* Another way we could have checked stclass at the
1079    current position only: */
1080    if (ml_anch) {
1081     s = t = t + 1;
1082     if (!check)
1083      goto giveup;
1084     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1085       "Looking for /%s^%s/m starting at offset %ld...\n",
1086       PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1087     goto try_at_offset;
1088    }
1089    if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1090     goto fail;
1091    /* Check is floating subtring. */
1092   retry_floating_check:
1093    t = check_at - start_shift;
1094    DEBUG_EXECUTE_r( what = "floating" );
1095    goto hop_and_restart;
1096   }
1097   if (t != s) {
1098    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1099       "By STCLASS: moving %ld --> %ld\n",
1100         (long)(t - i_strpos), (long)(s - i_strpos))
1101     );
1102   }
1103   else {
1104    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1105         "Does not contradict STCLASS...\n");
1106     );
1107   }
1108  }
1109   giveup:
1110  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1111       PL_colors[4], (check ? "Guessed" : "Giving up"),
1112       PL_colors[5], (long)(s - i_strpos)) );
1113  return s;
1114
1115   fail_finish:    /* Substring not found */
1116  if (prog->check_substr || prog->check_utf8)  /* could be removed already */
1117   BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1118   fail:
1119  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1120       PL_colors[4], PL_colors[5]));
1121  return NULL;
1122 }
1123
1124 #define DECL_TRIE_TYPE(scan) \
1125  const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1126      trie_type = (scan->flags != EXACT) \
1127        ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1128        : (do_utf8 ? trie_utf8 : trie_plain)
1129
1130 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1131 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1132  switch (trie_type) {                                                    \
1133  case trie_utf8_fold:                                                    \
1134   if ( foldlen>0 ) {                                                  \
1135    uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1136    foldlen -= len;                                                 \
1137    uscan += len;                                                   \
1138    len=0;                                                          \
1139   } else {                                                            \
1140    uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1141    uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1142    foldlen -= UNISKIP( uvc );                                      \
1143    uscan = foldbuf + UNISKIP( uvc );                               \
1144   }                                                                   \
1145   break;                                                              \
1146  case trie_latin_utf8_fold:                                              \
1147   if ( foldlen>0 ) {                                                  \
1148    uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1149    foldlen -= len;                                                 \
1150    uscan += len;                                                   \
1151    len=0;                                                          \
1152   } else {                                                            \
1153    len = 1;                                                        \
1154    uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1155    foldlen -= UNISKIP( uvc );                                      \
1156    uscan = foldbuf + UNISKIP( uvc );                               \
1157   }                                                                   \
1158   break;                                                              \
1159  case trie_utf8:                                                         \
1160   uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1161   break;                                                              \
1162  case trie_plain:                                                        \
1163   uvc = (UV)*uc;                                                      \
1164   len = 1;                                                            \
1165  }                                                                       \
1166  if (uvc < 256) {                                                        \
1167   charid = trie->charmap[ uvc ];                                      \
1168  }                                                                       \
1169  else {                                                                  \
1170   charid = 0;                                                         \
1171   if (widecharmap) {                                                  \
1172    SV** const svpp = hv_fetch(widecharmap,                         \
1173       (char*)&uvc, sizeof(UV), 0);                        \
1174    if (svpp)                                                       \
1175     charid = (U16)SvIV(*svpp);                                  \
1176   }                                                                   \
1177  }                                                                       \
1178 } STMT_END
1179
1180 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1181 {                                                      \
1182  char *my_strend= (char *)strend;                   \
1183  if ( (CoNd)                                        \
1184   && (ln == len ||                              \
1185    !ibcmp_utf8(s, &my_strend, 0,  do_utf8,   \
1186       m, NULL, ln, (bool)UTF))       \
1187   && (!reginfo || regtry(reginfo, &s)) )        \
1188   goto got_it;                                   \
1189  else {                                             \
1190   U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1191   uvchr_to_utf8(tmpbuf, c);                     \
1192   f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1193   if ( f != c                                   \
1194    && (f == c1 || f == c2)                  \
1195    && (ln == len ||                         \
1196     !ibcmp_utf8(s, &my_strend, 0,  do_utf8,\
1197        m, NULL, ln, (bool)UTF)) \
1198    && (!reginfo || regtry(reginfo, &s)) )   \
1199    goto got_it;                             \
1200  }                                                  \
1201 }                                                      \
1202 s += len
1203
1204 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1205 STMT_START {                                              \
1206  while (s <= e) {                                      \
1207   if ( (CoNd)                                       \
1208    && (ln == 1 || !(OP(c) == EXACTF             \
1209        ? ibcmp(s, m, ln)           \
1210        : ibcmp_locale(s, m, ln)))  \
1211    && (!reginfo || regtry(reginfo, &s)) )        \
1212    goto got_it;                                  \
1213   s++;                                              \
1214  }                                                     \
1215 } STMT_END
1216
1217 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1218 STMT_START {                                          \
1219  while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1220   CoDe                                          \
1221   s += uskip;                                   \
1222  }                                                 \
1223 } STMT_END
1224
1225 #define REXEC_FBC_SCAN(CoDe)                          \
1226 STMT_START {                                          \
1227  while (s < strend) {                              \
1228   CoDe                                          \
1229   s++;                                          \
1230  }                                                 \
1231 } STMT_END
1232
1233 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1234 REXEC_FBC_UTF8_SCAN(                                  \
1235  if (CoNd) {                                       \
1236   if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1237    goto got_it;                              \
1238   else                                          \
1239    tmp = doevery;                            \
1240  }                                                 \
1241  else                                              \
1242   tmp = 1;                                      \
1243 )
1244
1245 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1246 REXEC_FBC_SCAN(                                       \
1247  if (CoNd) {                                       \
1248   if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1249    goto got_it;                              \
1250   else                                          \
1251    tmp = doevery;                            \
1252  }                                                 \
1253  else                                              \
1254   tmp = 1;                                      \
1255 )
1256
1257 #define REXEC_FBC_TRYIT               \
1258 if ((!reginfo || regtry(reginfo, &s))) \
1259  goto got_it
1260
1261 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1262  if (do_utf8) {                                             \
1263   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1264  }                                                          \
1265  else {                                                     \
1266   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1267  }                                                          \
1268  break
1269
1270 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1271  if (do_utf8) {                                             \
1272   UtFpReLoAd;                                            \
1273   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1274  }                                                          \
1275  else {                                                     \
1276   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1277  }                                                          \
1278  break
1279
1280 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1281  PL_reg_flags |= RF_tainted;                                \
1282  if (do_utf8) {                                             \
1283   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1284  }                                                          \
1285  else {                                                     \
1286   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1287  }                                                          \
1288  break
1289
1290 #define DUMP_EXEC_POS(li,s,doutf8) \
1291  dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1292
1293 /* We know what class REx starts with.  Try to find this position... */
1294 /* if reginfo is NULL, its a dryrun */
1295 /* annoyingly all the vars in this routine have different names from their counterparts
1296    in regmatch. /grrr */
1297
1298 STATIC char *
1299 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1300  const char *strend, regmatch_info *reginfo)
1301 {
1302   dVAR;
1303   const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1304   char *m;
1305   STRLEN ln;
1306   STRLEN lnc;
1307   register STRLEN uskip;
1308   unsigned int c1;
1309   unsigned int c2;
1310   char *e;
1311   register I32 tmp = 1; /* Scratch variable? */
1312   register const bool do_utf8 = PL_reg_match_utf8;
1313   RXi_GET_DECL(prog,progi);
1314
1315   PERL_ARGS_ASSERT_FIND_BYCLASS;
1316
1317   /* We know what class it must start with. */
1318   switch (OP(c)) {
1319   case ANYOF:
1320    if (do_utf8) {
1321     REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1322       !UTF8_IS_INVARIANT((U8)s[0]) ?
1323       reginclass(prog, c, (U8*)s, 0, do_utf8) :
1324       REGINCLASS(prog, c, (U8*)s));
1325    }
1326    else {
1327     while (s < strend) {
1328      STRLEN skip = 1;
1329
1330      if (REGINCLASS(prog, c, (U8*)s) ||
1331       (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1332       /* The assignment of 2 is intentional:
1333        * for the folded sharp s, the skip is 2. */
1334       (skip = SHARP_S_SKIP))) {
1335       if (tmp && (!reginfo || regtry(reginfo, &s)))
1336         goto got_it;
1337       else
1338         tmp = doevery;
1339      }
1340      else
1341       tmp = 1;
1342      s += skip;
1343     }
1344    }
1345    break;
1346   case CANY:
1347    REXEC_FBC_SCAN(
1348     if (tmp && (!reginfo || regtry(reginfo, &s)))
1349      goto got_it;
1350     else
1351      tmp = doevery;
1352    );
1353    break;
1354   case EXACTF:
1355    m   = STRING(c);
1356    ln  = STR_LEN(c); /* length to match in octets/bytes */
1357    lnc = (I32) ln; /* length to match in characters */
1358    if (UTF) {
1359     STRLEN ulen1, ulen2;
1360     U8 *sm = (U8 *) m;
1361     U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1362     U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1363     /* used by commented-out code below */
1364     /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1365
1366     /* XXX: Since the node will be case folded at compile
1367     time this logic is a little odd, although im not
1368     sure that its actually wrong. --dmq */
1369
1370     c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1371     c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1372
1373     /* XXX: This is kinda strange. to_utf8_XYZ returns the
1374     codepoint of the first character in the converted
1375     form, yet originally we did the extra step.
1376     No tests fail by commenting this code out however
1377     so Ive left it out. -- dmq.
1378
1379     c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1380          0, uniflags);
1381     c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1382          0, uniflags);
1383     */
1384
1385     lnc = 0;
1386     while (sm < ((U8 *) m + ln)) {
1387      lnc++;
1388      sm += UTF8SKIP(sm);
1389     }
1390    }
1391    else {
1392     c1 = *(U8*)m;
1393     c2 = PL_fold[c1];
1394    }
1395    goto do_exactf;
1396   case EXACTFL:
1397    m   = STRING(c);
1398    ln  = STR_LEN(c);
1399    lnc = (I32) ln;
1400    c1 = *(U8*)m;
1401    c2 = PL_fold_locale[c1];
1402   do_exactf:
1403    e = HOP3c(strend, -((I32)lnc), s);
1404
1405    if (!reginfo && e < s)
1406     e = s;   /* Due to minlen logic of intuit() */
1407
1408    /* The idea in the EXACTF* cases is to first find the
1409    * first character of the EXACTF* node and then, if
1410    * necessary, case-insensitively compare the full
1411    * text of the node.  The c1 and c2 are the first
1412    * characters (though in Unicode it gets a bit
1413    * more complicated because there are more cases
1414    * than just upper and lower: one needs to use
1415    * the so-called folding case for case-insensitive
1416    * matching (called "loose matching" in Unicode).
1417    * ibcmp_utf8() will do just that. */
1418
1419    if (do_utf8 || UTF) {
1420     UV c, f;
1421     U8 tmpbuf [UTF8_MAXBYTES+1];
1422     STRLEN len = 1;
1423     STRLEN foldlen;
1424     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1425     if (c1 == c2) {
1426      /* Upper and lower of 1st char are equal -
1427      * probably not a "letter". */
1428      while (s <= e) {
1429       if (do_utf8) {
1430        c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1431           uniflags);
1432       } else {
1433        c = *((U8*)s);
1434       }
1435       REXEC_FBC_EXACTISH_CHECK(c == c1);
1436      }
1437     }
1438     else {
1439      while (s <= e) {
1440       if (do_utf8) {
1441        c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1442           uniflags);
1443       } else {
1444        c = *((U8*)s);
1445       }
1446
1447       /* Handle some of the three Greek sigmas cases.
1448       * Note that not all the possible combinations
1449       * are handled here: some of them are handled
1450       * by the standard folding rules, and some of
1451       * them (the character class or ANYOF cases)
1452       * are handled during compiletime in
1453       * regexec.c:S_regclass(). */
1454       if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1455        c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1456        c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1457
1458       REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1459      }
1460     }
1461    }
1462    else {
1463     /* Neither pattern nor string are UTF8 */
1464     if (c1 == c2)
1465      REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1466     else
1467      REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1468    }
1469    break;
1470   case BOUNDL:
1471    PL_reg_flags |= RF_tainted;
1472    /* FALL THROUGH */
1473   case BOUND:
1474    if (do_utf8) {
1475     if (s == PL_bostr)
1476      tmp = '\n';
1477     else {
1478      U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1479      tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1480     }
1481     tmp = ((OP(c) == BOUND ?
1482       isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1483     LOAD_UTF8_CHARCLASS_ALNUM();
1484     REXEC_FBC_UTF8_SCAN(
1485      if (tmp == !(OP(c) == BOUND ?
1486         (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1487         isALNUM_LC_utf8((U8*)s)))
1488      {
1489       tmp = !tmp;
1490       REXEC_FBC_TRYIT;
1491     }
1492     );
1493    }
1494    else {
1495     tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1496     tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1497     REXEC_FBC_SCAN(
1498      if (tmp ==
1499       !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1500       tmp = !tmp;
1501       REXEC_FBC_TRYIT;
1502     }
1503     );
1504    }
1505    if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1506     goto got_it;
1507    break;
1508   case NBOUNDL:
1509    PL_reg_flags |= RF_tainted;
1510    /* FALL THROUGH */
1511   case NBOUND:
1512    if (do_utf8) {
1513     if (s == PL_bostr)
1514      tmp = '\n';
1515     else {
1516      U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1517      tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1518     }
1519     tmp = ((OP(c) == NBOUND ?
1520       isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1521     LOAD_UTF8_CHARCLASS_ALNUM();
1522     REXEC_FBC_UTF8_SCAN(
1523      if (tmp == !(OP(c) == NBOUND ?
1524         (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1525         isALNUM_LC_utf8((U8*)s)))
1526       tmp = !tmp;
1527      else REXEC_FBC_TRYIT;
1528     );
1529    }
1530    else {
1531     tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1532     tmp = ((OP(c) == NBOUND ?
1533       isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1534     REXEC_FBC_SCAN(
1535      if (tmp ==
1536       !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1537       tmp = !tmp;
1538      else REXEC_FBC_TRYIT;
1539     );
1540    }
1541    if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1542     goto got_it;
1543    break;
1544   case ALNUM:
1545    REXEC_FBC_CSCAN_PRELOAD(
1546     LOAD_UTF8_CHARCLASS_PERL_WORD(),
1547     swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1548     isALNUM(*s)
1549    );
1550   case ALNUML:
1551    REXEC_FBC_CSCAN_TAINT(
1552     isALNUM_LC_utf8((U8*)s),
1553     isALNUM_LC(*s)
1554    );
1555   case NALNUM:
1556    REXEC_FBC_CSCAN_PRELOAD(
1557     LOAD_UTF8_CHARCLASS_PERL_WORD(),
1558     !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1559     !isALNUM(*s)
1560    );
1561   case NALNUML:
1562    REXEC_FBC_CSCAN_TAINT(
1563     !isALNUM_LC_utf8((U8*)s),
1564     !isALNUM_LC(*s)
1565    );
1566   case SPACE:
1567    REXEC_FBC_CSCAN_PRELOAD(
1568     LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1569     *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
1570     isSPACE(*s)
1571    );
1572   case SPACEL:
1573    REXEC_FBC_CSCAN_TAINT(
1574     *s == ' ' || isSPACE_LC_utf8((U8*)s),
1575     isSPACE_LC(*s)
1576    );
1577   case NSPACE:
1578    REXEC_FBC_CSCAN_PRELOAD(
1579     LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1580     !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
1581     !isSPACE(*s)
1582    );
1583   case NSPACEL:
1584    REXEC_FBC_CSCAN_TAINT(
1585     !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1586     !isSPACE_LC(*s)
1587    );
1588   case DIGIT:
1589    REXEC_FBC_CSCAN_PRELOAD(
1590     LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1591     swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1592     isDIGIT(*s)
1593    );
1594   case DIGITL:
1595    REXEC_FBC_CSCAN_TAINT(
1596     isDIGIT_LC_utf8((U8*)s),
1597     isDIGIT_LC(*s)
1598    );
1599   case NDIGIT:
1600    REXEC_FBC_CSCAN_PRELOAD(
1601     LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1602     !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1603     !isDIGIT(*s)
1604    );
1605   case NDIGITL:
1606    REXEC_FBC_CSCAN_TAINT(
1607     !isDIGIT_LC_utf8((U8*)s),
1608     !isDIGIT_LC(*s)
1609    );
1610   case LNBREAK:
1611    REXEC_FBC_CSCAN(
1612     is_LNBREAK_utf8(s),
1613     is_LNBREAK_latin1(s)
1614    );
1615   case VERTWS:
1616    REXEC_FBC_CSCAN(
1617     is_VERTWS_utf8(s),
1618     is_VERTWS_latin1(s)
1619    );
1620   case NVERTWS:
1621    REXEC_FBC_CSCAN(
1622     !is_VERTWS_utf8(s),
1623     !is_VERTWS_latin1(s)
1624    );
1625   case HORIZWS:
1626    REXEC_FBC_CSCAN(
1627     is_HORIZWS_utf8(s),
1628     is_HORIZWS_latin1(s)
1629    );
1630   case NHORIZWS:
1631    REXEC_FBC_CSCAN(
1632     !is_HORIZWS_utf8(s),
1633     !is_HORIZWS_latin1(s)
1634    );
1635   case AHOCORASICKC:
1636   case AHOCORASICK:
1637    {
1638     DECL_TRIE_TYPE(c);
1639     /* what trie are we using right now */
1640     reg_ac_data *aho
1641      = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1642     reg_trie_data *trie
1643      = (reg_trie_data*)progi->data->data[ aho->trie ];
1644     HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1645
1646     const char *last_start = strend - trie->minlen;
1647 #ifdef DEBUGGING
1648     const char *real_start = s;
1649 #endif
1650     STRLEN maxlen = trie->maxlen;
1651     SV *sv_points;
1652     U8 **points; /* map of where we were in the input string
1653         when reading a given char. For ASCII this
1654         is unnecessary overhead as the relationship
1655         is always 1:1, but for Unicode, especially
1656         case folded Unicode this is not true. */
1657     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1658     U8 *bitmap=NULL;
1659
1660
1661     GET_RE_DEBUG_FLAGS_DECL;
1662
1663     /* We can't just allocate points here. We need to wrap it in
1664     * an SV so it gets freed properly if there is a croak while
1665     * running the match */
1666     ENTER;
1667     SAVETMPS;
1668     sv_points=newSV(maxlen * sizeof(U8 *));
1669     SvCUR_set(sv_points,
1670      maxlen * sizeof(U8 *));
1671     SvPOK_on(sv_points);
1672     sv_2mortal(sv_points);
1673     points=(U8**)SvPV_nolen(sv_points );
1674     if ( trie_type != trie_utf8_fold
1675      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1676     {
1677      if (trie->bitmap)
1678       bitmap=(U8*)trie->bitmap;
1679      else
1680       bitmap=(U8*)ANYOF_BITMAP(c);
1681     }
1682     /* this is the Aho-Corasick algorithm modified a touch
1683     to include special handling for long "unknown char"
1684     sequences. The basic idea being that we use AC as long
1685     as we are dealing with a possible matching char, when
1686     we encounter an unknown char (and we have not encountered
1687     an accepting state) we scan forward until we find a legal
1688     starting char.
1689     AC matching is basically that of trie matching, except
1690     that when we encounter a failing transition, we fall back
1691     to the current states "fail state", and try the current char
1692     again, a process we repeat until we reach the root state,
1693     state 1, or a legal transition. If we fail on the root state
1694     then we can either terminate if we have reached an accepting
1695     state previously, or restart the entire process from the beginning
1696     if we have not.
1697
1698     */
1699     while (s <= last_start) {
1700      const U32 uniflags = UTF8_ALLOW_DEFAULT;
1701      U8 *uc = (U8*)s;
1702      U16 charid = 0;
1703      U32 base = 1;
1704      U32 state = 1;
1705      UV uvc = 0;
1706      STRLEN len = 0;
1707      STRLEN foldlen = 0;
1708      U8 *uscan = (U8*)NULL;
1709      U8 *leftmost = NULL;
1710 #ifdef DEBUGGING
1711      U32 accepted_word= 0;
1712 #endif
1713      U32 pointpos = 0;
1714
1715      while ( state && uc <= (U8*)strend ) {
1716       int failed=0;
1717       U32 word = aho->states[ state ].wordnum;
1718
1719       if( state==1 ) {
1720        if ( bitmap ) {
1721         DEBUG_TRIE_EXECUTE_r(
1722          if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1723           dump_exec_pos( (char *)uc, c, strend, real_start,
1724            (char *)uc, do_utf8 );
1725           PerlIO_printf( Perl_debug_log,
1726            " Scanning for legal start char...\n");
1727          }
1728         );
1729         while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1730          uc++;
1731         }
1732         s= (char *)uc;
1733        }
1734        if (uc >(U8*)last_start) break;
1735       }
1736
1737       if ( word ) {
1738        U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1739        if (!leftmost || lpos < leftmost) {
1740         DEBUG_r(accepted_word=word);
1741         leftmost= lpos;
1742        }
1743        if (base==0) break;
1744
1745       }
1746       points[pointpos++ % maxlen]= uc;
1747       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1748            uscan, len, uvc, charid, foldlen,
1749            foldbuf, uniflags);
1750       DEBUG_TRIE_EXECUTE_r({
1751        dump_exec_pos( (char *)uc, c, strend, real_start,
1752         s,   do_utf8 );
1753        PerlIO_printf(Perl_debug_log,
1754         " Charid:%3u CP:%4"UVxf" ",
1755         charid, uvc);
1756       });
1757
1758       do {
1759 #ifdef DEBUGGING
1760        word = aho->states[ state ].wordnum;
1761 #endif
1762        base = aho->states[ state ].trans.base;
1763
1764        DEBUG_TRIE_EXECUTE_r({
1765         if (failed)
1766          dump_exec_pos( (char *)uc, c, strend, real_start,
1767           s,   do_utf8 );
1768         PerlIO_printf( Perl_debug_log,
1769          "%sState: %4"UVxf", word=%"UVxf,
1770          failed ? " Fail transition to " : "",
1771          (UV)state, (UV)word);
1772        });
1773        if ( base ) {
1774         U32 tmp;
1775         if (charid &&
1776          (base + charid > trie->uniquecharcount )
1777          && (base + charid - 1 - trie->uniquecharcount
1778            < trie->lasttrans)
1779          && trie->trans[base + charid - 1 -
1780            trie->uniquecharcount].check == state
1781          && (tmp=trie->trans[base + charid - 1 -
1782           trie->uniquecharcount ].next))
1783         {
1784          DEBUG_TRIE_EXECUTE_r(
1785           PerlIO_printf( Perl_debug_log," - legal\n"));
1786          state = tmp;
1787          break;
1788         }
1789         else {
1790          DEBUG_TRIE_EXECUTE_r(
1791           PerlIO_printf( Perl_debug_log," - fail\n"));
1792          failed = 1;
1793          state = aho->fail[state];
1794         }
1795        }
1796        else {
1797         /* we must be accepting here */
1798         DEBUG_TRIE_EXECUTE_r(
1799           PerlIO_printf( Perl_debug_log," - accepting\n"));
1800         failed = 1;
1801         break;
1802        }
1803       } while(state);
1804       uc += len;
1805       if (failed) {
1806        if (leftmost)
1807         break;
1808        if (!state) state = 1;
1809       }
1810      }
1811      if ( aho->states[ state ].wordnum ) {
1812       U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1813       if (!leftmost || lpos < leftmost) {
1814        DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1815        leftmost = lpos;
1816       }
1817      }
1818      if (leftmost) {
1819       s = (char*)leftmost;
1820       DEBUG_TRIE_EXECUTE_r({
1821        PerlIO_printf(
1822         Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1823         (UV)accepted_word, (IV)(s - real_start)
1824        );
1825       });
1826       if (!reginfo || regtry(reginfo, &s)) {
1827        FREETMPS;
1828        LEAVE;
1829        goto got_it;
1830       }
1831       s = HOPc(s,1);
1832       DEBUG_TRIE_EXECUTE_r({
1833        PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1834       });
1835      } else {
1836       DEBUG_TRIE_EXECUTE_r(
1837        PerlIO_printf( Perl_debug_log,"No match.\n"));
1838       break;
1839      }
1840     }
1841     FREETMPS;
1842     LEAVE;
1843    }
1844    break;
1845   default:
1846    Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1847    break;
1848   }
1849   return 0;
1850  got_it:
1851   return s;
1852 }
1853
1854
1855 /*
1856  - regexec_flags - match a regexp against a string
1857  */
1858 I32
1859 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1860    char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1861 /* strend: pointer to null at end of string */
1862 /* strbeg: real beginning of string */
1863 /* minend: end of match must be >=minend after stringarg. */
1864 /* data: May be used for some additional optimizations.
1865   Currently its only used, with a U32 cast, for transmitting
1866   the ganch offset when doing a /g match. This will change */
1867 /* nosave: For optimizations. */
1868 {
1869  dVAR;
1870  struct regexp *const prog = (struct regexp *)SvANY(rx);
1871  /*register*/ char *s;
1872  register regnode *c;
1873  /*register*/ char *startpos = stringarg;
1874  I32 minlen;  /* must match at least this many chars */
1875  I32 dontbother = 0; /* how many characters not to try at end */
1876  I32 end_shift = 0;   /* Same for the end. */  /* CC */
1877  I32 scream_pos = -1;  /* Internal iterator of scream. */
1878  char *scream_olds = NULL;
1879  const bool do_utf8 = (bool)DO_UTF8(sv);
1880  I32 multiline;
1881  RXi_GET_DECL(prog,progi);
1882  regmatch_info reginfo;  /* create some info to pass to regtry etc */
1883  regexp_paren_pair *swap = NULL;
1884  GET_RE_DEBUG_FLAGS_DECL;
1885
1886  PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1887  PERL_UNUSED_ARG(data);
1888
1889  /* Be paranoid... */
1890  if (prog == NULL || startpos == NULL) {
1891   Perl_croak(aTHX_ "NULL regexp parameter");
1892   return 0;
1893  }
1894
1895  multiline = prog->extflags & RXf_PMf_MULTILINE;
1896  reginfo.prog = rx;  /* Yes, sorry that this is confusing.  */
1897
1898  RX_MATCH_UTF8_set(rx, do_utf8);
1899  DEBUG_EXECUTE_r(
1900   debug_start_match(rx, do_utf8, startpos, strend,
1901   "Matching");
1902  );
1903
1904  minlen = prog->minlen;
1905
1906  if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1907   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1908        "String too short [regexec_flags]...\n"));
1909   goto phooey;
1910  }
1911
1912
1913  /* Check validity of program. */
1914  if (UCHARAT(progi->program) != REG_MAGIC) {
1915   Perl_croak(aTHX_ "corrupted regexp program");
1916  }
1917
1918  PL_reg_flags = 0;
1919  PL_reg_eval_set = 0;
1920  PL_reg_maxiter = 0;
1921
1922  if (RX_UTF8(rx))
1923   PL_reg_flags |= RF_utf8;
1924
1925  /* Mark beginning of line for ^ and lookbehind. */
1926  reginfo.bol = startpos; /* XXX not used ??? */
1927  PL_bostr  = strbeg;
1928  reginfo.sv = sv;
1929
1930  /* Mark end of line for $ (and such) */
1931  PL_regeol = strend;
1932
1933  /* see how far we have to get to not match where we matched before */
1934  reginfo.till = startpos+minend;
1935
1936  /* If there is a "must appear" string, look for it. */
1937  s = startpos;
1938
1939  if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1940   MAGIC *mg;
1941   if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
1942    reginfo.ganch = startpos + prog->gofs;
1943    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1944    "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1945   } else if (sv && SvTYPE(sv) >= SVt_PVMG
1946     && SvMAGIC(sv)
1947     && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1948     && mg->mg_len >= 0) {
1949    reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1950    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1951     "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
1952
1953    if (prog->extflags & RXf_ANCH_GPOS) {
1954     if (s > reginfo.ganch)
1955      goto phooey;
1956     s = reginfo.ganch - prog->gofs;
1957     DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1958      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
1959     if (s < strbeg)
1960      goto phooey;
1961    }
1962   }
1963   else if (data) {
1964    reginfo.ganch = strbeg + PTR2UV(data);
1965    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1966     "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1967
1968   } else {    /* pos() not defined */
1969    reginfo.ganch = strbeg;
1970    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1971     "GPOS: reginfo.ganch = strbeg\n"));
1972   }
1973  }
1974  if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1975   /* We have to be careful. If the previous successful match
1976   was from this regex we don't want a subsequent partially
1977   successful match to clobber the old results.
1978   So when we detect this possibility we add a swap buffer
1979   to the re, and switch the buffer each match. If we fail
1980   we switch it back, otherwise we leave it swapped.
1981   */
1982   swap = prog->offs;
1983   /* do we need a save destructor here for eval dies? */
1984   Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
1985  }
1986  if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1987   re_scream_pos_data d;
1988
1989   d.scream_olds = &scream_olds;
1990   d.scream_pos = &scream_pos;
1991   s = re_intuit_start(rx, sv, s, strend, flags, &d);
1992   if (!s) {
1993    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1994    goto phooey; /* not present */
1995   }
1996  }
1997
1998
1999
2000  /* Simplest case:  anchored match need be tried only once. */
2001  /*  [unless only anchor is BOL and multiline is set] */
2002  if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2003   if (s == startpos && regtry(&reginfo, &startpos))
2004    goto got_it;
2005   else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2006     || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2007   {
2008    char *end;
2009
2010    if (minlen)
2011     dontbother = minlen - 1;
2012    end = HOP3c(strend, -dontbother, strbeg) - 1;
2013    /* for multiline we only have to try after newlines */
2014    if (prog->check_substr || prog->check_utf8) {
2015     if (s == startpos)
2016      goto after_try;
2017     while (1) {
2018      if (regtry(&reginfo, &s))
2019       goto got_it;
2020     after_try:
2021      if (s > end)
2022       goto phooey;
2023      if (prog->extflags & RXf_USE_INTUIT) {
2024       s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2025       if (!s)
2026        goto phooey;
2027      }
2028      else
2029       s++;
2030     }
2031    } else {
2032     if (s > startpos)
2033      s--;
2034     while (s < end) {
2035      if (*s++ == '\n') { /* don't need PL_utf8skip here */
2036       if (regtry(&reginfo, &s))
2037        goto got_it;
2038      }
2039     }
2040    }
2041   }
2042   goto phooey;
2043  } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2044  {
2045   /* the warning about reginfo.ganch being used without intialization
2046   is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2047   and we only enter this block when the same bit is set. */
2048   char *tmp_s = reginfo.ganch - prog->gofs;
2049
2050   if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2051    goto got_it;
2052   goto phooey;
2053  }
2054
2055  /* Messy cases:  unanchored match. */
2056  if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2057   /* we have /x+whatever/ */
2058   /* it must be a one character string (XXXX Except UTF?) */
2059   char ch;
2060 #ifdef DEBUGGING
2061   int did_match = 0;
2062 #endif
2063   if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2064    do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2065   ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
2066
2067   if (do_utf8) {
2068    REXEC_FBC_SCAN(
2069     if (*s == ch) {
2070      DEBUG_EXECUTE_r( did_match = 1 );
2071      if (regtry(&reginfo, &s)) goto got_it;
2072      s += UTF8SKIP(s);
2073      while (s < strend && *s == ch)
2074       s += UTF8SKIP(s);
2075     }
2076    );
2077   }
2078   else {
2079    REXEC_FBC_SCAN(
2080     if (*s == ch) {
2081      DEBUG_EXECUTE_r( did_match = 1 );
2082      if (regtry(&reginfo, &s)) goto got_it;
2083      s++;
2084      while (s < strend && *s == ch)
2085       s++;
2086     }
2087    );
2088   }
2089   DEBUG_EXECUTE_r(if (!did_match)
2090     PerlIO_printf(Perl_debug_log,
2091         "Did not find anchored character...\n")
2092    );
2093  }
2094  else if (prog->anchored_substr != NULL
2095    || prog->anchored_utf8 != NULL
2096    || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2097     && prog->float_max_offset < strend - s)) {
2098   SV *must;
2099   I32 back_max;
2100   I32 back_min;
2101   char *last;
2102   char *last1;  /* Last position checked before */
2103 #ifdef DEBUGGING
2104   int did_match = 0;
2105 #endif
2106   if (prog->anchored_substr || prog->anchored_utf8) {
2107    if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2108     do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2109    must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2110    back_max = back_min = prog->anchored_offset;
2111   } else {
2112    if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2113     do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2114    must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2115    back_max = prog->float_max_offset;
2116    back_min = prog->float_min_offset;
2117   }
2118
2119
2120   if (must == &PL_sv_undef)
2121    /* could not downgrade utf8 check substring, so must fail */
2122    goto phooey;
2123
2124   if (back_min<0) {
2125    last = strend;
2126   } else {
2127    last = HOP3c(strend, /* Cannot start after this */
2128     -(I32)(CHR_SVLEN(must)
2129       - (SvTAIL(must) != 0) + back_min), strbeg);
2130   }
2131   if (s > PL_bostr)
2132    last1 = HOPc(s, -1);
2133   else
2134    last1 = s - 1; /* bogus */
2135
2136   /* XXXX check_substr already used to find "s", can optimize if
2137   check_substr==must. */
2138   scream_pos = -1;
2139   dontbother = end_shift;
2140   strend = HOPc(strend, -dontbother);
2141   while ( (s <= last) &&
2142     ((flags & REXEC_SCREAM)
2143     ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2144          end_shift, &scream_pos, 0))
2145     : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2146         (unsigned char*)strend, must,
2147         multiline ? FBMrf_MULTILINE : 0))) ) {
2148    /* we may be pointing at the wrong string */
2149    if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2150     s = strbeg + (s - SvPVX_const(sv));
2151    DEBUG_EXECUTE_r( did_match = 1 );
2152    if (HOPc(s, -back_max) > last1) {
2153     last1 = HOPc(s, -back_min);
2154     s = HOPc(s, -back_max);
2155    }
2156    else {
2157     char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2158
2159     last1 = HOPc(s, -back_min);
2160     s = t;
2161    }
2162    if (do_utf8) {
2163     while (s <= last1) {
2164      if (regtry(&reginfo, &s))
2165       goto got_it;
2166      s += UTF8SKIP(s);
2167     }
2168    }
2169    else {
2170     while (s <= last1) {
2171      if (regtry(&reginfo, &s))
2172       goto got_it;
2173      s++;
2174     }
2175    }
2176   }
2177   DEBUG_EXECUTE_r(if (!did_match) {
2178    RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2179     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2180    PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2181        ((must == prog->anchored_substr || must == prog->anchored_utf8)
2182        ? "anchored" : "floating"),
2183     quoted, RE_SV_TAIL(must));
2184   });
2185   goto phooey;
2186  }
2187  else if ( (c = progi->regstclass) ) {
2188   if (minlen) {
2189    const OPCODE op = OP(progi->regstclass);
2190    /* don't bother with what can't match */
2191    if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2192     strend = HOPc(strend, -(minlen - 1));
2193   }
2194   DEBUG_EXECUTE_r({
2195    SV * const prop = sv_newmortal();
2196    regprop(prog, prop, c);
2197    {
2198     RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2199      s,strend-s,60);
2200     PerlIO_printf(Perl_debug_log,
2201      "Matching stclass %.*s against %s (%d chars)\n",
2202      (int)SvCUR(prop), SvPVX_const(prop),
2203      quoted, (int)(strend - s));
2204    }
2205   });
2206   if (find_byclass(prog, c, s, strend, &reginfo))
2207    goto got_it;
2208   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2209  }
2210  else {
2211   dontbother = 0;
2212   if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2213    /* Trim the end. */
2214    char *last;
2215    SV* float_real;
2216
2217    if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2218     do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2219    float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2220
2221    if (flags & REXEC_SCREAM) {
2222     last = screaminstr(sv, float_real, s - strbeg,
2223         end_shift, &scream_pos, 1); /* last one */
2224     if (!last)
2225      last = scream_olds; /* Only one occurrence. */
2226     /* we may be pointing at the wrong string */
2227     else if (RXp_MATCH_COPIED(prog))
2228      s = strbeg + (s - SvPVX_const(sv));
2229    }
2230    else {
2231     STRLEN len;
2232     const char * const little = SvPV_const(float_real, len);
2233
2234     if (SvTAIL(float_real)) {
2235      if (memEQ(strend - len + 1, little, len - 1))
2236       last = strend - len + 1;
2237      else if (!multiline)
2238       last = memEQ(strend - len, little, len)
2239        ? strend - len : NULL;
2240      else
2241       goto find_last;
2242     } else {
2243     find_last:
2244      if (len)
2245       last = rninstr(s, strend, little, little + len);
2246      else
2247       last = strend; /* matching "$" */
2248     }
2249    }
2250    if (last == NULL) {
2251     DEBUG_EXECUTE_r(
2252      PerlIO_printf(Perl_debug_log,
2253       "%sCan't trim the tail, match fails (should not happen)%s\n",
2254       PL_colors[4], PL_colors[5]));
2255     goto phooey; /* Should not happen! */
2256    }
2257    dontbother = strend - last + prog->float_min_offset;
2258   }
2259   if (minlen && (dontbother < minlen))
2260    dontbother = minlen - 1;
2261   strend -= dontbother;      /* this one's always in bytes! */
2262   /* We don't know much -- general case. */
2263   if (do_utf8) {
2264    for (;;) {
2265     if (regtry(&reginfo, &s))
2266      goto got_it;
2267     if (s >= strend)
2268      break;
2269     s += UTF8SKIP(s);
2270    };
2271   }
2272   else {
2273    do {
2274     if (regtry(&reginfo, &s))
2275      goto got_it;
2276    } while (s++ < strend);
2277   }
2278  }
2279
2280  /* Failure. */
2281  goto phooey;
2282
2283 got_it:
2284  Safefree(swap);
2285  RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2286
2287  if (PL_reg_eval_set)
2288   restore_pos(aTHX_ prog);
2289  if (RXp_PAREN_NAMES(prog))
2290   (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2291
2292  /* make sure $`, $&, $', and $digit will work later */
2293  if ( !(flags & REXEC_NOT_FIRST) ) {
2294   RX_MATCH_COPY_FREE(rx);
2295   if (flags & REXEC_COPY_STR) {
2296    const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2297 #ifdef PERL_OLD_COPY_ON_WRITE
2298    if ((SvIsCOW(sv)
2299     || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2300     if (DEBUG_C_TEST) {
2301      PerlIO_printf(Perl_debug_log,
2302         "Copy on write: regexp capture, type %d\n",
2303         (int) SvTYPE(sv));
2304     }
2305     prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2306     prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2307     assert (SvPOKp(prog->saved_copy));
2308    } else
2309 #endif
2310    {
2311     RX_MATCH_COPIED_on(rx);
2312     s = savepvn(strbeg, i);
2313     prog->subbeg = s;
2314    }
2315    prog->sublen = i;
2316   }
2317   else {
2318    prog->subbeg = strbeg;
2319    prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2320   }
2321  }
2322
2323  return 1;
2324
2325 phooey:
2326  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2327       PL_colors[4], PL_colors[5]));
2328  if (PL_reg_eval_set)
2329   restore_pos(aTHX_ prog);
2330  if (swap) {
2331   /* we failed :-( roll it back */
2332   Safefree(prog->offs);
2333   prog->offs = swap;
2334  }
2335
2336  return 0;
2337 }
2338
2339
2340 /*
2341  - regtry - try match at specific point
2342  */
2343 STATIC I32   /* 0 failure, 1 success */
2344 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2345 {
2346  dVAR;
2347  CHECKPOINT lastcp;
2348  REGEXP *const rx = reginfo->prog;
2349  regexp *const prog = (struct regexp *)SvANY(rx);
2350  RXi_GET_DECL(prog,progi);
2351  GET_RE_DEBUG_FLAGS_DECL;
2352
2353  PERL_ARGS_ASSERT_REGTRY;
2354
2355  reginfo->cutpoint=NULL;
2356
2357  if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2358   MAGIC *mg;
2359
2360   PL_reg_eval_set = RS_init;
2361   DEBUG_EXECUTE_r(DEBUG_s(
2362    PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2363       (IV)(PL_stack_sp - PL_stack_base));
2364    ));
2365   SAVESTACK_CXPOS();
2366   cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2367   /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2368   SAVETMPS;
2369   /* Apparently this is not needed, judging by wantarray. */
2370   /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2371   cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2372
2373   if (reginfo->sv) {
2374    /* Make $_ available to executed code. */
2375    if (reginfo->sv != DEFSV) {
2376     SAVE_DEFSV;
2377     DEFSV_set(reginfo->sv);
2378    }
2379
2380    if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2381     && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2382     /* prepare for quick setting of pos */
2383 #ifdef PERL_OLD_COPY_ON_WRITE
2384     if (SvIsCOW(reginfo->sv))
2385      sv_force_normal_flags(reginfo->sv, 0);
2386 #endif
2387     mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2388         &PL_vtbl_mglob, NULL, 0);
2389     mg->mg_len = -1;
2390    }
2391    PL_reg_magic    = mg;
2392    PL_reg_oldpos   = mg->mg_len;
2393    SAVEDESTRUCTOR_X(restore_pos, prog);
2394   }
2395   if (!PL_reg_curpm) {
2396    Newxz(PL_reg_curpm, 1, PMOP);
2397 #ifdef USE_ITHREADS
2398    {
2399     SV* const repointer = &PL_sv_undef;
2400     /* this regexp is also owned by the new PL_reg_curpm, which
2401     will try to free it.  */
2402     av_push(PL_regex_padav, repointer);
2403     PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2404     PL_regex_pad = AvARRAY(PL_regex_padav);
2405    }
2406 #endif
2407   }
2408 #ifdef USE_ITHREADS
2409   /* It seems that non-ithreads works both with and without this code.
2410   So for efficiency reasons it seems best not to have the code
2411   compiled when it is not needed.  */
2412   /* This is safe against NULLs: */
2413   ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2414   /* PM_reg_curpm owns a reference to this regexp.  */
2415   ReREFCNT_inc(rx);
2416 #endif
2417   PM_SETRE(PL_reg_curpm, rx);
2418   PL_reg_oldcurpm = PL_curpm;
2419   PL_curpm = PL_reg_curpm;
2420   if (RXp_MATCH_COPIED(prog)) {
2421    /*  Here is a serious problem: we cannot rewrite subbeg,
2422     since it may be needed if this match fails.  Thus
2423     $` inside (?{}) could fail... */
2424    PL_reg_oldsaved = prog->subbeg;
2425    PL_reg_oldsavedlen = prog->sublen;
2426 #ifdef PERL_OLD_COPY_ON_WRITE
2427    PL_nrs = prog->saved_copy;
2428 #endif
2429    RXp_MATCH_COPIED_off(prog);
2430   }
2431   else
2432    PL_reg_oldsaved = NULL;
2433   prog->subbeg = PL_bostr;
2434   prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2435  }
2436  DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2437  prog->offs[0].start = *startpos - PL_bostr;
2438  PL_reginput = *startpos;
2439  PL_reglastparen = &prog->lastparen;
2440  PL_reglastcloseparen = &prog->lastcloseparen;
2441  prog->lastparen = 0;
2442  prog->lastcloseparen = 0;
2443  PL_regsize = 0;
2444  PL_regoffs = prog->offs;
2445  if (PL_reg_start_tmpl <= prog->nparens) {
2446   PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2447   if(PL_reg_start_tmp)
2448    Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2449   else
2450    Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2451  }
2452
2453  /* XXXX What this code is doing here?!!!  There should be no need
2454  to do this again and again, PL_reglastparen should take care of
2455  this!  --ilya*/
2456
2457  /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2458  * Actually, the code in regcppop() (which Ilya may be meaning by
2459  * PL_reglastparen), is not needed at all by the test suite
2460  * (op/regexp, op/pat, op/split), but that code is needed otherwise
2461  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2462  * Meanwhile, this code *is* needed for the
2463  * above-mentioned test suite tests to succeed.  The common theme
2464  * on those tests seems to be returning null fields from matches.
2465  * --jhi updated by dapm */
2466 #if 1
2467  if (prog->nparens) {
2468   regexp_paren_pair *pp = PL_regoffs;
2469   register I32 i;
2470   for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2471    ++pp;
2472    pp->start = -1;
2473    pp->end = -1;
2474   }
2475  }
2476 #endif
2477  REGCP_SET(lastcp);
2478  if (regmatch(reginfo, progi->program + 1)) {
2479   PL_regoffs[0].end = PL_reginput - PL_bostr;
2480   return 1;
2481  }
2482  if (reginfo->cutpoint)
2483   *startpos= reginfo->cutpoint;
2484  REGCP_UNWIND(lastcp);
2485  return 0;
2486 }
2487
2488
2489 #define sayYES goto yes
2490 #define sayNO goto no
2491 #define sayNO_SILENT goto no_silent
2492
2493 /* we dont use STMT_START/END here because it leads to
2494    "unreachable code" warnings, which are bogus, but distracting. */
2495 #define CACHEsayNO \
2496  if (ST.cache_mask) \
2497  PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2498  sayNO
2499
2500 /* this is used to determine how far from the left messages like
2501    'failed...' are printed. It should be set such that messages
2502    are inline with the regop output that created them.
2503 */
2504 #define REPORT_CODE_OFF 32
2505
2506
2507 /* Make sure there is a test for this +1 options in re_tests */
2508 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2509
2510 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2511 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2512
2513 #define SLAB_FIRST(s) (&(s)->states[0])
2514 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2515
2516 /* grab a new slab and return the first slot in it */
2517
2518 STATIC regmatch_state *
2519 S_push_slab(pTHX)
2520 {
2521 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2522  dMY_CXT;
2523 #endif
2524  regmatch_slab *s = PL_regmatch_slab->next;
2525  if (!s) {
2526   Newx(s, 1, regmatch_slab);
2527   s->prev = PL_regmatch_slab;
2528   s->next = NULL;
2529   PL_regmatch_slab->next = s;
2530  }
2531  PL_regmatch_slab = s;
2532  return SLAB_FIRST(s);
2533 }
2534
2535
2536 /* push a new state then goto it */
2537
2538 #define PUSH_STATE_GOTO(state, node) \
2539  scan = node; \
2540  st->resume_state = state; \
2541  goto push_state;
2542
2543 /* push a new state with success backtracking, then goto it */
2544
2545 #define PUSH_YES_STATE_GOTO(state, node) \
2546  scan = node; \
2547  st->resume_state = state; \
2548  goto push_yes_state;
2549
2550
2551
2552 /*
2553
2554 regmatch() - main matching routine
2555
2556 This is basically one big switch statement in a loop. We execute an op,
2557 set 'next' to point the next op, and continue. If we come to a point which
2558 we may need to backtrack to on failure such as (A|B|C), we push a
2559 backtrack state onto the backtrack stack. On failure, we pop the top
2560 state, and re-enter the loop at the state indicated. If there are no more
2561 states to pop, we return failure.
2562
2563 Sometimes we also need to backtrack on success; for example /A+/, where
2564 after successfully matching one A, we need to go back and try to
2565 match another one; similarly for lookahead assertions: if the assertion
2566 completes successfully, we backtrack to the state just before the assertion
2567 and then carry on.  In these cases, the pushed state is marked as
2568 'backtrack on success too'. This marking is in fact done by a chain of
2569 pointers, each pointing to the previous 'yes' state. On success, we pop to
2570 the nearest yes state, discarding any intermediate failure-only states.
2571 Sometimes a yes state is pushed just to force some cleanup code to be
2572 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2573 it to free the inner regex.
2574
2575 Note that failure backtracking rewinds the cursor position, while
2576 success backtracking leaves it alone.
2577
2578 A pattern is complete when the END op is executed, while a subpattern
2579 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2580 ops trigger the "pop to last yes state if any, otherwise return true"
2581 behaviour.
2582
2583 A common convention in this function is to use A and B to refer to the two
2584 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2585 the subpattern to be matched possibly multiple times, while B is the entire
2586 rest of the pattern. Variable and state names reflect this convention.
2587
2588 The states in the main switch are the union of ops and failure/success of
2589 substates associated with with that op.  For example, IFMATCH is the op
2590 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2591 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2592 successfully matched A and IFMATCH_A_fail is a state saying that we have
2593 just failed to match A. Resume states always come in pairs. The backtrack
2594 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2595 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2596 on success or failure.
2597
2598 The struct that holds a backtracking state is actually a big union, with
2599 one variant for each major type of op. The variable st points to the
2600 top-most backtrack struct. To make the code clearer, within each
2601 block of code we #define ST to alias the relevant union.
2602
2603 Here's a concrete example of a (vastly oversimplified) IFMATCH
2604 implementation:
2605
2606  switch (state) {
2607  ....
2608
2609 #define ST st->u.ifmatch
2610
2611  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2612   ST.foo = ...; // some state we wish to save
2613   ...
2614   // push a yes backtrack state with a resume value of
2615   // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2616   // first node of A:
2617   PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2618   // NOTREACHED
2619
2620  case IFMATCH_A: // we have successfully executed A; now continue with B
2621   next = B;
2622   bar = ST.foo; // do something with the preserved value
2623   break;
2624
2625  case IFMATCH_A_fail: // A failed, so the assertion failed
2626   ...;   // do some housekeeping, then ...
2627   sayNO; // propagate the failure
2628
2629 #undef ST
2630
2631  ...
2632  }
2633
2634 For any old-timers reading this who are familiar with the old recursive
2635 approach, the code above is equivalent to:
2636
2637  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2638  {
2639   int foo = ...
2640   ...
2641   if (regmatch(A)) {
2642    next = B;
2643    bar = foo;
2644    break;
2645   }
2646   ...;   // do some housekeeping, then ...
2647   sayNO; // propagate the failure
2648  }
2649
2650 The topmost backtrack state, pointed to by st, is usually free. If you
2651 want to claim it, populate any ST.foo fields in it with values you wish to
2652 save, then do one of
2653
2654   PUSH_STATE_GOTO(resume_state, node);
2655   PUSH_YES_STATE_GOTO(resume_state, node);
2656
2657 which sets that backtrack state's resume value to 'resume_state', pushes a
2658 new free entry to the top of the backtrack stack, then goes to 'node'.
2659 On backtracking, the free slot is popped, and the saved state becomes the
2660 new free state. An ST.foo field in this new top state can be temporarily
2661 accessed to retrieve values, but once the main loop is re-entered, it
2662 becomes available for reuse.
2663
2664 Note that the depth of the backtrack stack constantly increases during the
2665 left-to-right execution of the pattern, rather than going up and down with
2666 the pattern nesting. For example the stack is at its maximum at Z at the
2667 end of the pattern, rather than at X in the following:
2668
2669  /(((X)+)+)+....(Y)+....Z/
2670
2671 The only exceptions to this are lookahead/behind assertions and the cut,
2672 (?>A), which pop all the backtrack states associated with A before
2673 continuing.
2674
2675 Bascktrack state structs are allocated in slabs of about 4K in size.
2676 PL_regmatch_state and st always point to the currently active state,
2677 and PL_regmatch_slab points to the slab currently containing
2678 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2679 allocated, and is never freed until interpreter destruction. When the slab
2680 is full, a new one is allocated and chained to the end. At exit from
2681 regmatch(), slabs allocated since entry are freed.
2682
2683 */
2684
2685
2686 #define DEBUG_STATE_pp(pp)        \
2687  DEBUG_STATE_r({         \
2688   DUMP_EXEC_POS(locinput, scan, do_utf8);      \
2689   PerlIO_printf(Perl_debug_log,       \
2690    "    %*s"pp" %s%s%s%s%s\n",       \
2691    depth*2, "",        \
2692    PL_reg_name[st->resume_state],                     \
2693    ((st==yes_state||st==mark_state) ? "[" : ""),   \
2694    ((st==yes_state) ? "Y" : ""),                   \
2695    ((st==mark_state) ? "M" : ""),                  \
2696    ((st==yes_state||st==mark_state) ? "]" : "")    \
2697   );                                                  \
2698  });
2699
2700
2701 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2702
2703 #ifdef DEBUGGING
2704
2705 STATIC void
2706 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2707  const char *start, const char *end, const char *blurb)
2708 {
2709  const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2710
2711  PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2712
2713  if (!PL_colorset)
2714    reginitcolors();
2715  {
2716   RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2717    RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2718
2719   RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2720    start, end - start, 60);
2721
2722   PerlIO_printf(Perl_debug_log,
2723    "%s%s REx%s %s against %s\n",
2724      PL_colors[4], blurb, PL_colors[5], s0, s1);
2725
2726   if (do_utf8||utf8_pat)
2727    PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2728     utf8_pat ? "pattern" : "",
2729     utf8_pat && do_utf8 ? " and " : "",
2730     do_utf8 ? "string" : ""
2731    );
2732  }
2733 }
2734
2735 STATIC void
2736 S_dump_exec_pos(pTHX_ const char *locinput,
2737      const regnode *scan,
2738      const char *loc_regeol,
2739      const char *loc_bostr,
2740      const char *loc_reg_starttry,
2741      const bool do_utf8)
2742 {
2743  const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2744  const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2745  int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2746  /* The part of the string before starttry has one color
2747  (pref0_len chars), between starttry and current
2748  position another one (pref_len - pref0_len chars),
2749  after the current position the third one.
2750  We assume that pref0_len <= pref_len, otherwise we
2751  decrease pref0_len.  */
2752  int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2753   ? (5 + taill) - l : locinput - loc_bostr;
2754  int pref0_len;
2755
2756  PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2757
2758  while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2759   pref_len++;
2760  pref0_len = pref_len  - (locinput - loc_reg_starttry);
2761  if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2762   l = ( loc_regeol - locinput > (5 + taill) - pref_len
2763    ? (5 + taill) - pref_len : loc_regeol - locinput);
2764  while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2765   l--;
2766  if (pref0_len < 0)
2767   pref0_len = 0;
2768  if (pref0_len > pref_len)
2769   pref0_len = pref_len;
2770  {
2771   const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2772
2773   RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2774    (locinput - pref_len),pref0_len, 60, 4, 5);
2775
2776   RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2777      (locinput - pref_len + pref0_len),
2778      pref_len - pref0_len, 60, 2, 3);
2779
2780   RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2781      locinput, loc_regeol - locinput, 10, 0, 1);
2782
2783   const STRLEN tlen=len0+len1+len2;
2784   PerlIO_printf(Perl_debug_log,
2785      "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2786      (IV)(locinput - loc_bostr),
2787      len0, s0,
2788      len1, s1,
2789      (docolor ? "" : "> <"),
2790      len2, s2,
2791      (int)(tlen > 19 ? 0 :  19 - tlen),
2792      "");
2793  }
2794 }
2795
2796 #endif
2797
2798 /* reg_check_named_buff_matched()
2799  * Checks to see if a named buffer has matched. The data array of
2800  * buffer numbers corresponding to the buffer is expected to reside
2801  * in the regexp->data->data array in the slot stored in the ARG() of
2802  * node involved. Note that this routine doesn't actually care about the
2803  * name, that information is not preserved from compilation to execution.
2804  * Returns the index of the leftmost defined buffer with the given name
2805  * or 0 if non of the buffers matched.
2806  */
2807 STATIC I32
2808 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2809 {
2810  I32 n;
2811  RXi_GET_DECL(rex,rexi);
2812  SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2813  I32 *nums=(I32*)SvPVX(sv_dat);
2814
2815  PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2816
2817  for ( n=0; n<SvIVX(sv_dat); n++ ) {
2818   if ((I32)*PL_reglastparen >= nums[n] &&
2819    PL_regoffs[nums[n]].end != -1)
2820   {
2821    return nums[n];
2822   }
2823  }
2824  return 0;
2825 }
2826
2827
2828 /* free all slabs above current one  - called during LEAVE_SCOPE */
2829
2830 STATIC void
2831 S_clear_backtrack_stack(pTHX_ void *p)
2832 {
2833  regmatch_slab *s = PL_regmatch_slab->next;
2834  PERL_UNUSED_ARG(p);
2835
2836  if (!s)
2837   return;
2838  PL_regmatch_slab->next = NULL;
2839  while (s) {
2840   regmatch_slab * const osl = s;
2841   s = s->next;
2842   Safefree(osl);
2843  }
2844 }
2845
2846
2847 #define SETREX(Re1,Re2) \
2848  if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2849  Re1 = (Re2)
2850
2851 STATIC I32   /* 0 failure, 1 success */
2852 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2853 {
2854 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2855  dMY_CXT;
2856 #endif
2857  dVAR;
2858  register const bool do_utf8 = PL_reg_match_utf8;
2859  const U32 uniflags = UTF8_ALLOW_DEFAULT;
2860  REGEXP *rex_sv = reginfo->prog;
2861  regexp *rex = (struct regexp *)SvANY(rex_sv);
2862  RXi_GET_DECL(rex,rexi);
2863  I32 oldsave;
2864  /* the current state. This is a cached copy of PL_regmatch_state */
2865  register regmatch_state *st;
2866  /* cache heavy used fields of st in registers */
2867  register regnode *scan;
2868  register regnode *next;
2869  register U32 n = 0; /* general value; init to avoid compiler warning */
2870  register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2871  register char *locinput = PL_reginput;
2872  register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2873
2874  bool result = 0;     /* return value of S_regmatch */
2875  int depth = 0;     /* depth of backtrack stack */
2876  U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2877  const U32 max_nochange_depth =
2878   (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2879   3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2880  regmatch_state *yes_state = NULL; /* state to pop to on success of
2881                subpattern */
2882  /* mark_state piggy backs on the yes_state logic so that when we unwind
2883  the stack on success we can update the mark_state as we go */
2884  regmatch_state *mark_state = NULL; /* last mark state we have seen */
2885  regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2886  struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2887  U32 state_num;
2888  bool no_final = 0;      /* prevent failure from backtracking? */
2889  bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2890  char *startpoint = PL_reginput;
2891  SV *popmark = NULL;     /* are we looking for a mark? */
2892  SV *sv_commit = NULL;   /* last mark name seen in failure */
2893  SV *sv_yes_mark = NULL; /* last mark name we have seen
2894        during a successfull match */
2895  U32 lastopen = 0;       /* last open we saw */
2896  bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2897  SV* const oreplsv = GvSV(PL_replgv);
2898  /* these three flags are set by various ops to signal information to
2899  * the very next op. They have a useful lifetime of exactly one loop
2900  * iteration, and are not preserved or restored by state pushes/pops
2901  */
2902  bool sw = 0;     /* the condition value in (?(cond)a|b) */
2903  bool minmod = 0;     /* the next "{n,m}" is a "{n,m}?" */
2904  int logical = 0;     /* the following EVAL is:
2905         0: (?{...})
2906         1: (?(?{...})X|Y)
2907         2: (??{...})
2908        or the following IFMATCH/UNLESSM is:
2909         false: plain (?=foo)
2910         true:  used as a condition: (?(?=foo))
2911        */
2912 #ifdef DEBUGGING
2913  GET_RE_DEBUG_FLAGS_DECL;
2914 #endif
2915
2916  PERL_ARGS_ASSERT_REGMATCH;
2917
2918  DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2919    PerlIO_printf(Perl_debug_log,"regmatch start\n");
2920  }));
2921  /* on first ever call to regmatch, allocate first slab */
2922  if (!PL_regmatch_slab) {
2923   Newx(PL_regmatch_slab, 1, regmatch_slab);
2924   PL_regmatch_slab->prev = NULL;
2925   PL_regmatch_slab->next = NULL;
2926   PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2927  }
2928
2929  oldsave = PL_savestack_ix;
2930  SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2931  SAVEVPTR(PL_regmatch_slab);
2932  SAVEVPTR(PL_regmatch_state);
2933
2934  /* grab next free state slot */
2935  st = ++PL_regmatch_state;
2936  if (st >  SLAB_LAST(PL_regmatch_slab))
2937   st = PL_regmatch_state = S_push_slab(aTHX);
2938
2939  /* Note that nextchr is a byte even in UTF */
2940  nextchr = UCHARAT(locinput);
2941  scan = prog;
2942  while (scan != NULL) {
2943
2944   DEBUG_EXECUTE_r( {
2945    SV * const prop = sv_newmortal();
2946    regnode *rnext=regnext(scan);
2947    DUMP_EXEC_POS( locinput, scan, do_utf8 );
2948    regprop(rex, prop, scan);
2949
2950    PerlIO_printf(Perl_debug_log,
2951      "%3"IVdf":%*s%s(%"IVdf")\n",
2952      (IV)(scan - rexi->program), depth*2, "",
2953      SvPVX_const(prop),
2954      (PL_regkind[OP(scan)] == END || !rnext) ?
2955       0 : (IV)(rnext - rexi->program));
2956   });
2957
2958   next = scan + NEXT_OFF(scan);
2959   if (next == scan)
2960    next = NULL;
2961   state_num = OP(scan);
2962
2963   REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
2964  reenter_switch:
2965
2966   assert(PL_reglastparen == &rex->lastparen);
2967   assert(PL_reglastcloseparen == &rex->lastcloseparen);
2968   assert(PL_regoffs == rex->offs);
2969
2970   switch (state_num) {
2971   case BOL:
2972    if (locinput == PL_bostr)
2973    {
2974     /* reginfo->till = reginfo->bol; */
2975     break;
2976    }
2977    sayNO;
2978   case MBOL:
2979    if (locinput == PL_bostr ||
2980     ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2981    {
2982     break;
2983    }
2984    sayNO;
2985   case SBOL:
2986    if (locinput == PL_bostr)
2987     break;
2988    sayNO;
2989   case GPOS:
2990    if (locinput == reginfo->ganch)
2991     break;
2992    sayNO;
2993
2994   case KEEPS:
2995    /* update the startpoint */
2996    st->u.keeper.val = PL_regoffs[0].start;
2997    PL_reginput = locinput;
2998    PL_regoffs[0].start = locinput - PL_bostr;
2999    PUSH_STATE_GOTO(KEEPS_next, next);
3000    /*NOT-REACHED*/
3001   case KEEPS_next_fail:
3002    /* rollback the start point change */
3003    PL_regoffs[0].start = st->u.keeper.val;
3004    sayNO_SILENT;
3005    /*NOT-REACHED*/
3006   case EOL:
3007     goto seol;
3008   case MEOL:
3009    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3010     sayNO;
3011    break;
3012   case SEOL:
3013   seol:
3014    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3015     sayNO;
3016    if (PL_regeol - locinput > 1)
3017     sayNO;
3018    break;
3019   case EOS:
3020    if (PL_regeol != locinput)
3021     sayNO;
3022    break;
3023   case SANY:
3024    if (!nextchr && locinput >= PL_regeol)
3025     sayNO;
3026    if (do_utf8) {
3027     locinput += PL_utf8skip[nextchr];
3028     if (locinput > PL_regeol)
3029      sayNO;
3030     nextchr = UCHARAT(locinput);
3031    }
3032    else
3033     nextchr = UCHARAT(++locinput);
3034    break;
3035   case CANY:
3036    if (!nextchr && locinput >= PL_regeol)
3037     sayNO;
3038    nextchr = UCHARAT(++locinput);
3039    break;
3040   case REG_ANY:
3041    if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3042     sayNO;
3043    if (do_utf8) {
3044     locinput += PL_utf8skip[nextchr];
3045     if (locinput > PL_regeol)
3046      sayNO;
3047     nextchr = UCHARAT(locinput);
3048    }
3049    else
3050     nextchr = UCHARAT(++locinput);
3051    break;
3052
3053 #undef  ST
3054 #define ST st->u.trie
3055   case TRIEC:
3056    /* In this case the charclass data is available inline so
3057    we can fail fast without a lot of extra overhead.
3058    */
3059    if (scan->flags == EXACT || !do_utf8) {
3060     if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3061      DEBUG_EXECUTE_r(
3062       PerlIO_printf(Perl_debug_log,
3063          "%*s  %sfailed to match trie start class...%s\n",
3064          REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3065      );
3066      sayNO_SILENT;
3067      /* NOTREACHED */
3068     }
3069    }
3070    /* FALL THROUGH */
3071   case TRIE:
3072    {
3073     /* what type of TRIE am I? (utf8 makes this contextual) */
3074     DECL_TRIE_TYPE(scan);
3075
3076     /* what trie are we using right now */
3077     reg_trie_data * const trie
3078      = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3079     HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3080     U32 state = trie->startstate;
3081
3082     if (trie->bitmap && trie_type != trie_utf8_fold &&
3083      !TRIE_BITMAP_TEST(trie,*locinput)
3084     ) {
3085      if (trie->states[ state ].wordnum) {
3086       DEBUG_EXECUTE_r(
3087        PerlIO_printf(Perl_debug_log,
3088           "%*s  %smatched empty string...%s\n",
3089           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3090       );
3091       break;
3092      } else {
3093       DEBUG_EXECUTE_r(
3094        PerlIO_printf(Perl_debug_log,
3095           "%*s  %sfailed to match trie start class...%s\n",
3096           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3097       );
3098       sayNO_SILENT;
3099     }
3100     }
3101
3102    {
3103     U8 *uc = ( U8* )locinput;
3104
3105     STRLEN len = 0;
3106     STRLEN foldlen = 0;
3107     U8 *uscan = (U8*)NULL;
3108     STRLEN bufflen=0;
3109     SV *sv_accept_buff = NULL;
3110     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3111
3112      ST.accepted = 0; /* how many accepting states we have seen */
3113     ST.B = next;
3114     ST.jump = trie->jump;
3115     ST.me = scan;
3116     /*
3117     traverse the TRIE keeping track of all accepting states
3118     we transition through until we get to a failing node.
3119     */
3120
3121     while ( state && uc <= (U8*)PL_regeol ) {
3122      U32 base = trie->states[ state ].trans.base;
3123      UV uvc = 0;
3124      U16 charid;
3125      /* We use charid to hold the wordnum as we don't use it
3126      for charid until after we have done the wordnum logic.
3127      We define an alias just so that the wordnum logic reads
3128      more naturally. */
3129
3130 #define got_wordnum charid
3131      got_wordnum = trie->states[ state ].wordnum;
3132
3133      if ( got_wordnum ) {
3134       if ( ! ST.accepted ) {
3135        ENTER;
3136        SAVETMPS; /* XXX is this necessary? dmq */
3137        bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3138        sv_accept_buff=newSV(bufflen *
3139            sizeof(reg_trie_accepted) - 1);
3140        SvCUR_set(sv_accept_buff, 0);
3141        SvPOK_on(sv_accept_buff);
3142        sv_2mortal(sv_accept_buff);
3143        SAVETMPS;
3144        ST.accept_buff =
3145         (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3146       }
3147       do {
3148        if (ST.accepted >= bufflen) {
3149         bufflen *= 2;
3150         ST.accept_buff =(reg_trie_accepted*)
3151          SvGROW(sv_accept_buff,
3152            bufflen * sizeof(reg_trie_accepted));
3153        }
3154        SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3155         + sizeof(reg_trie_accepted));
3156
3157
3158        ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3159        ST.accept_buff[ST.accepted].endpos = uc;
3160        ++ST.accepted;
3161       } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3162      }
3163 #undef got_wordnum
3164
3165      DEBUG_TRIE_EXECUTE_r({
3166         DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3167         PerlIO_printf( Perl_debug_log,
3168          "%*s  %sState: %4"UVxf" Accepted: %4"UVxf" ",
3169          2+depth * 2, "", PL_colors[4],
3170          (UV)state, (UV)ST.accepted );
3171      });
3172
3173      if ( base ) {
3174       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3175            uscan, len, uvc, charid, foldlen,
3176            foldbuf, uniflags);
3177
3178       if (charid &&
3179        (base + charid > trie->uniquecharcount )
3180        && (base + charid - 1 - trie->uniquecharcount
3181          < trie->lasttrans)
3182        && trie->trans[base + charid - 1 -
3183          trie->uniquecharcount].check == state)
3184       {
3185        state = trie->trans[base + charid - 1 -
3186         trie->uniquecharcount ].next;
3187       }
3188       else {
3189        state = 0;
3190       }
3191       uc += len;
3192
3193      }
3194      else {
3195       state = 0;
3196      }
3197      DEBUG_TRIE_EXECUTE_r(
3198       PerlIO_printf( Perl_debug_log,
3199        "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3200        charid, uvc, (UV)state, PL_colors[5] );
3201      );
3202     }
3203     if (!ST.accepted )
3204     sayNO;
3205
3206     DEBUG_EXECUTE_r(
3207      PerlIO_printf( Perl_debug_log,
3208       "%*s  %sgot %"IVdf" possible matches%s\n",
3209       REPORT_CODE_OFF + depth * 2, "",
3210       PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3211     );
3212    }}
3213    goto trie_first_try; /* jump into the fail handler */
3214    /* NOTREACHED */
3215   case TRIE_next_fail: /* we failed - try next alterative */
3216    if ( ST.jump) {
3217     REGCP_UNWIND(ST.cp);
3218     for (n = *PL_reglastparen; n > ST.lastparen; n--)
3219      PL_regoffs[n].end = -1;
3220     *PL_reglastparen = n;
3221    }
3222   trie_first_try:
3223    if (do_cutgroup) {
3224     do_cutgroup = 0;
3225     no_final = 0;
3226    }
3227
3228    if ( ST.jump) {
3229     ST.lastparen = *PL_reglastparen;
3230     REGCP_SET(ST.cp);
3231    }
3232    if ( ST.accepted == 1 ) {
3233     /* only one choice left - just continue */
3234     DEBUG_EXECUTE_r({
3235      AV *const trie_words
3236       = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3237      SV ** const tmp = av_fetch( trie_words,
3238       ST.accept_buff[ 0 ].wordnum-1, 0 );
3239      SV *sv= tmp ? sv_newmortal() : NULL;
3240
3241      PerlIO_printf( Perl_debug_log,
3242       "%*s  %sonly one match left: #%d <%s>%s\n",
3243       REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3244       ST.accept_buff[ 0 ].wordnum,
3245       tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3246         PL_colors[0], PL_colors[1],
3247         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3248        )
3249       : "not compiled under -Dr",
3250       PL_colors[5] );
3251     });
3252     PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3253     /* in this case we free tmps/leave before we call regmatch
3254     as we wont be using accept_buff again. */
3255
3256     locinput = PL_reginput;
3257     nextchr = UCHARAT(locinput);
3258      if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3259       scan = ST.B;
3260      else
3261       scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3262     if (!has_cutgroup) {
3263      FREETMPS;
3264      LEAVE;
3265     } else {
3266      ST.accepted--;
3267      PUSH_YES_STATE_GOTO(TRIE_next, scan);
3268     }
3269
3270     continue; /* execute rest of RE */
3271    }
3272
3273    if ( !ST.accepted-- ) {
3274     DEBUG_EXECUTE_r({
3275      PerlIO_printf( Perl_debug_log,
3276       "%*s  %sTRIE failed...%s\n",
3277       REPORT_CODE_OFF+depth*2, "",
3278       PL_colors[4],
3279       PL_colors[5] );
3280     });
3281     FREETMPS;
3282     LEAVE;
3283     sayNO_SILENT;
3284     /*NOTREACHED*/
3285    }
3286
3287    /*
3288    There are at least two accepting states left.  Presumably
3289    the number of accepting states is going to be low,
3290    typically two. So we simply scan through to find the one
3291    with lowest wordnum.  Once we find it, we swap the last
3292    state into its place and decrement the size. We then try to
3293    match the rest of the pattern at the point where the word
3294    ends. If we succeed, control just continues along the
3295    regex; if we fail we return here to try the next accepting
3296    state
3297    */
3298
3299    {
3300     U32 best = 0;
3301     U32 cur;
3302     for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3303      DEBUG_TRIE_EXECUTE_r(
3304       PerlIO_printf( Perl_debug_log,
3305        "%*s  %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3306        REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3307        (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3308        ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3309      );
3310
3311      if (ST.accept_buff[cur].wordnum <
3312        ST.accept_buff[best].wordnum)
3313       best = cur;
3314     }
3315
3316     DEBUG_EXECUTE_r({
3317      AV *const trie_words
3318       = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3319      SV ** const tmp = av_fetch( trie_words,
3320       ST.accept_buff[ best ].wordnum - 1, 0 );
3321      regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3322          ST.B :
3323          ST.me + ST.jump[ST.accept_buff[best].wordnum];
3324      SV *sv= tmp ? sv_newmortal() : NULL;
3325
3326      PerlIO_printf( Perl_debug_log,
3327       "%*s  %strying alternation #%d <%s> at node #%d %s\n",
3328       REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3329       ST.accept_buff[best].wordnum,
3330       tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3331         PL_colors[0], PL_colors[1],
3332         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3333        ) : "not compiled under -Dr",
3334        REG_NODE_NUM(nextop),
3335       PL_colors[5] );
3336     });
3337
3338     if ( best<ST.accepted ) {
3339      reg_trie_accepted tmp = ST.accept_buff[ best ];
3340      ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3341      ST.accept_buff[ ST.accepted ] = tmp;
3342      best = ST.accepted;
3343     }
3344     PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3345     if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3346      scan = ST.B;
3347     } else {
3348      scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3349     }
3350     PUSH_YES_STATE_GOTO(TRIE_next, scan);
3351     /* NOTREACHED */
3352    }
3353    /* NOTREACHED */
3354   case TRIE_next:
3355    /* we dont want to throw this away, see bug 57042*/
3356    if (oreplsv != GvSV(PL_replgv))
3357     sv_setsv(oreplsv, GvSV(PL_replgv));
3358    FREETMPS;
3359    LEAVE;
3360    sayYES;
3361 #undef  ST
3362
3363   case EXACT: {
3364    char *s = STRING(scan);
3365    ln = STR_LEN(scan);
3366    if (do_utf8 != UTF) {
3367     /* The target and the pattern have differing utf8ness. */
3368     char *l = locinput;
3369     const char * const e = s + ln;
3370
3371     if (do_utf8) {
3372      /* The target is utf8, the pattern is not utf8. */
3373      while (s < e) {
3374       STRLEN ulen;
3375       if (l >= PL_regeol)
3376        sayNO;
3377       if (NATIVE_TO_UNI(*(U8*)s) !=
3378        utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3379            uniflags))
3380        sayNO;
3381       l += ulen;
3382       s ++;
3383      }
3384     }
3385     else {
3386      /* The target is not utf8, the pattern is utf8. */
3387      while (s < e) {
3388       STRLEN ulen;
3389       if (l >= PL_regeol)
3390        sayNO;
3391       if (NATIVE_TO_UNI(*((U8*)l)) !=
3392        utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3393           uniflags))
3394        sayNO;
3395       s += ulen;
3396       l ++;
3397      }
3398     }
3399     locinput = l;
3400     nextchr = UCHARAT(locinput);
3401     break;
3402    }
3403    /* The target and the pattern have the same utf8ness. */
3404    /* Inline the first character, for speed. */
3405    if (UCHARAT(s) != nextchr)
3406     sayNO;
3407    if (PL_regeol - locinput < ln)
3408     sayNO;
3409    if (ln > 1 && memNE(s, locinput, ln))
3410     sayNO;
3411    locinput += ln;
3412    nextchr = UCHARAT(locinput);
3413    break;
3414    }
3415   case EXACTFL:
3416    PL_reg_flags |= RF_tainted;
3417    /* FALL THROUGH */
3418   case EXACTF: {
3419    char * const s = STRING(scan);
3420    ln = STR_LEN(scan);
3421
3422    if (do_utf8 || UTF) {
3423    /* Either target or the pattern are utf8. */
3424     const char * const l = locinput;
3425     char *e = PL_regeol;
3426
3427     if (ibcmp_utf8(s, 0,  ln, (bool)UTF,
3428        l, &e, 0,  do_utf8)) {
3429      /* One more case for the sharp s:
3430      * pack("U0U*", 0xDF) =~ /ss/i,
3431      * the 0xC3 0x9F are the UTF-8
3432      * byte sequence for the U+00DF. */
3433
3434      if (!(do_utf8 &&
3435       toLOWER(s[0]) == 's' &&
3436       ln >= 2 &&
3437       toLOWER(s[1]) == 's' &&
3438       (U8)l[0] == 0xC3 &&
3439       e - l >= 2 &&
3440       (U8)l[1] == 0x9F))
3441       sayNO;
3442     }
3443     locinput = e;
3444     nextchr = UCHARAT(locinput);
3445     break;
3446    }
3447
3448    /* Neither the target and the pattern are utf8. */
3449
3450    /* Inline the first character, for speed. */
3451    if (UCHARAT(s) != nextchr &&
3452     UCHARAT(s) != ((OP(scan) == EXACTF)
3453        ? PL_fold : PL_fold_locale)[nextchr])
3454     sayNO;
3455    if (PL_regeol - locinput < ln)
3456     sayNO;
3457    if (ln > 1 && (OP(scan) == EXACTF
3458       ? ibcmp(s, locinput, ln)
3459       : ibcmp_locale(s, locinput, ln)))
3460     sayNO;
3461    locinput += ln;
3462    nextchr = UCHARAT(locinput);
3463    break;
3464    }
3465   case BOUNDL:
3466   case NBOUNDL:
3467    PL_reg_flags |= RF_tainted;
3468    /* FALL THROUGH */
3469   case BOUND:
3470   case NBOUND:
3471    /* was last char in word? */
3472    if (do_utf8) {
3473     if (locinput == PL_bostr)
3474      ln = '\n';
3475     else {
3476      const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3477
3478      ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3479     }
3480     if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3481      ln = isALNUM_uni(ln);
3482      LOAD_UTF8_CHARCLASS_ALNUM();
3483      n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3484     }
3485     else {
3486      ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3487      n = isALNUM_LC_utf8((U8*)locinput);
3488     }
3489    }
3490    else {
3491     ln = (locinput != PL_bostr) ?
3492      UCHARAT(locinput - 1) : '\n';
3493     if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3494      ln = isALNUM(ln);
3495      n = isALNUM(nextchr);
3496     }
3497     else {
3498      ln = isALNUM_LC(ln);
3499      n = isALNUM_LC(nextchr);
3500     }
3501    }
3502    if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3503          OP(scan) == BOUNDL))
3504      sayNO;
3505    break;
3506   case ANYOF:
3507    if (do_utf8) {
3508     STRLEN inclasslen = PL_regeol - locinput;
3509
3510     if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3511      goto anyof_fail;
3512     if (locinput >= PL_regeol)
3513      sayNO;
3514     locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3515     nextchr = UCHARAT(locinput);
3516     break;
3517    }
3518    else {
3519     if (nextchr < 0)
3520      nextchr = UCHARAT(locinput);
3521     if (!REGINCLASS(rex, scan, (U8*)locinput))
3522      goto anyof_fail;
3523     if (!nextchr && locinput >= PL_regeol)
3524      sayNO;
3525     nextchr = UCHARAT(++locinput);
3526     break;
3527    }
3528   anyof_fail:
3529    /* If we might have the case of the German sharp s
3530    * in a casefolding Unicode character class. */
3531
3532    if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3533     locinput += SHARP_S_SKIP;
3534     nextchr = UCHARAT(locinput);
3535    }
3536    else
3537     sayNO;
3538    break;
3539   /* Special char classes - The defines start on line 129 or so */
3540   CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3541   CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3542
3543   CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3544   CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3545
3546   CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3547   CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3548
3549   case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3550      a Unicode extended Grapheme Cluster */
3551    /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3552    extended Grapheme Cluster is:
3553
3554    CR LF
3555    | Prepend* Begin Extend*
3556    | .
3557
3558    Begin is (Hangul-syllable | ! Control)
3559    Extend is (Grapheme_Extend | Spacing_Mark)
3560    Control is [ GCB_Control CR LF ]
3561
3562    The discussion below shows how the code for CLUMP is derived
3563    from this regex.  Note that most of these concepts are from
3564    property values of the Grapheme Cluster Boundary (GCB) property.
3565    No code point can have multiple property values for a given
3566    property.  Thus a code point in Prepend can't be in Control, but
3567    it must be in !Control.  This is why Control above includes
3568    GCB_Control plus CR plus LF.  The latter two are used in the GCB
3569    property separately, and so can't be in GCB_Control, even though
3570    they logically are controls.  Control is not the same as gc=cc,
3571    but includes format and other characters as well.
3572
3573    The Unicode definition of Hangul-syllable is:
3574     L+
3575     | (L* ( ( V | LV ) V* | LVT ) T*)
3576     | T+
3577     )
3578    Each of these is a value for the GCB property, and hence must be
3579    disjoint, so the order they are tested is immaterial, so the
3580    above can safely be changed to
3581     T+
3582     | L+
3583     | (L* ( LVT | ( V | LV ) V*) T*)
3584
3585    The last two terms can be combined like this:
3586     L* ( L
3587       | (( LVT | ( V | LV ) V*) T*))
3588
3589    And refactored into this:
3590     L* (L | LVT T* | V  V* T* | LV  V* T*)
3591
3592    That means that if we have seen any L's at all we can quit
3593    there, but if the next character is a LVT, a V or and LV we
3594    should keep going.
3595
3596    There is a subtlety with Prepend* which showed up in testing.
3597    Note that the Begin, and only the Begin is required in:
3598     | Prepend* Begin Extend*
3599    Also, Begin contains '! Control'.  A Prepend must be a '!
3600    Control', which means it must be a Begin.  What it comes down to
3601    is that if we match Prepend* and then find no suitable Begin
3602    afterwards, that if we backtrack the last Prepend, that one will
3603    be a suitable Begin.
3604    */
3605
3606    if (locinput >= PL_regeol)
3607     sayNO;
3608    if  (! do_utf8) {
3609
3610     /* Match either CR LF  or '.', as all the other possibilities
3611     * require utf8 */
3612     locinput++;     /* Match the . or CR */
3613     if (nextchr == '\r'
3614      && locinput < PL_regeol
3615      && UCHARAT(locinput) == '\n') locinput++;
3616    }
3617    else {
3618
3619     /* Utf8: See if is ( CR LF ); already know that locinput <
3620     * PL_regeol, so locinput+1 is in bounds */
3621     if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3622      locinput += 2;
3623     }
3624     else {
3625      /* In case have to backtrack to beginning, then match '.' */
3626      char *starting = locinput;
3627
3628      /* In case have to backtrack the last prepend */
3629      char *previous_prepend = 0;
3630
3631      LOAD_UTF8_CHARCLASS_GCB();
3632
3633      /* Match (prepend)* */
3634      while (locinput < PL_regeol
3635       && swash_fetch(PL_utf8_X_prepend,
3636           (U8*)locinput, do_utf8))
3637      {
3638       previous_prepend = locinput;
3639       locinput += UTF8SKIP(locinput);
3640      }
3641
3642      /* As noted above, if we matched a prepend character, but
3643      * the next thing won't match, back off the last prepend we
3644      * matched, as it is guaranteed to match the begin */
3645      if (previous_prepend
3646       && (locinput >=  PL_regeol
3647        || ! swash_fetch(PL_utf8_X_begin,
3648            (U8*)locinput, do_utf8)))
3649      {
3650       locinput = previous_prepend;
3651      }
3652
3653      /* Note that here we know PL_regeol > locinput, as we
3654      * tested that upon input to this switch case, and if we
3655      * moved locinput forward, we tested the result just above
3656      * and it either passed, or we backed off so that it will
3657      * now pass */
3658      if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) {
3659
3660       /* Here did not match the required 'Begin' in the
3661       * second term.  So just match the very first
3662       * character, the '.' of the final term of the regex */
3663       locinput = starting + UTF8SKIP(starting);
3664      } else {
3665
3666       /* Here is the beginning of a character that can have
3667       * an extender.  It is either a hangul syllable, or a
3668       * non-control */
3669       if (swash_fetch(PL_utf8_X_non_hangul,
3670           (U8*)locinput, do_utf8))
3671       {
3672
3673        /* Here not a Hangul syllable, must be a
3674        * ('!  * Control') */
3675        locinput += UTF8SKIP(locinput);
3676       } else {
3677
3678        /* Here is a Hangul syllable.  It can be composed
3679        * of several individual characters.  One
3680        * possibility is T+ */
3681        if (swash_fetch(PL_utf8_X_T,
3682            (U8*)locinput, do_utf8))
3683        {
3684         while (locinput < PL_regeol
3685           && swash_fetch(PL_utf8_X_T,
3686               (U8*)locinput, do_utf8))
3687         {
3688          locinput += UTF8SKIP(locinput);
3689         }
3690        } else {
3691
3692         /* Here, not T+, but is a Hangul.  That means
3693         * it is one of the others: L, LV, LVT or V,
3694         * and matches:
3695         * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3696
3697         /* Match L*           */
3698         while (locinput < PL_regeol
3699           && swash_fetch(PL_utf8_X_L,
3700               (U8*)locinput, do_utf8))
3701         {
3702          locinput += UTF8SKIP(locinput);
3703         }
3704
3705         /* Here, have exhausted L*.  If the next
3706         * character is not an LV, LVT nor V, it means
3707         * we had to have at least one L, so matches L+
3708         * in the original equation, we have a complete
3709         * hangul syllable.  Are done. */
3710
3711         if (locinput < PL_regeol
3712          && swash_fetch(PL_utf8_X_LV_LVT_V,
3713              (U8*)locinput, do_utf8))
3714         {
3715
3716          /* Otherwise keep going.  Must be LV, LVT
3717          * or V.  See if LVT */
3718          if (swash_fetch(PL_utf8_X_LVT,
3719              (U8*)locinput, do_utf8))
3720          {
3721           locinput += UTF8SKIP(locinput);
3722          } else {
3723
3724           /* Must be  V or LV.  Take it, then
3725           * match V*     */
3726           locinput += UTF8SKIP(locinput);
3727           while (locinput < PL_regeol
3728             && swash_fetch(PL_utf8_X_V,
3729               (U8*)locinput, do_utf8))
3730           {
3731            locinput += UTF8SKIP(locinput);
3732           }
3733          }
3734
3735          /* And any of LV, LVT, or V can be followed
3736          * by T*            */
3737          while (locinput < PL_regeol
3738           && swash_fetch(PL_utf8_X_T,
3739               (U8*)locinput,
3740               do_utf8))
3741          {
3742           locinput += UTF8SKIP(locinput);
3743          }
3744         }
3745        }
3746       }
3747
3748       /* Match any extender */
3749       while (locinput < PL_regeol
3750         && swash_fetch(PL_utf8_X_extend,
3751             (U8*)locinput, do_utf8))
3752       {
3753        locinput += UTF8SKIP(locinput);
3754       }
3755      }
3756     }
3757     if (locinput > PL_regeol) sayNO;
3758    }
3759    nextchr = UCHARAT(locinput);
3760    break;
3761
3762   case NREFFL:
3763   {
3764    char *s;
3765    char type;
3766    PL_reg_flags |= RF_tainted;
3767    /* FALL THROUGH */
3768   case NREF:
3769   case NREFF:
3770    type = OP(scan);
3771    n = reg_check_named_buff_matched(rex,scan);
3772
3773    if ( n ) {
3774     type = REF + ( type - NREF );
3775     goto do_ref;
3776    } else {
3777     sayNO;
3778    }
3779    /* unreached */
3780   case REFFL:
3781    PL_reg_flags |= RF_tainted;
3782    /* FALL THROUGH */
3783   case REF:
3784   case REFF:
3785    n = ARG(scan);  /* which paren pair */
3786    type = OP(scan);
3787   do_ref:
3788    ln = PL_regoffs[n].start;
3789    PL_reg_leftiter = PL_reg_maxiter;  /* Void cache */
3790    if (*PL_reglastparen < n || ln == -1)
3791     sayNO;   /* Do not match unless seen CLOSEn. */
3792    if (ln == PL_regoffs[n].end)
3793     break;
3794
3795    s = PL_bostr + ln;
3796    if (do_utf8 && type != REF) { /* REF can do byte comparison */
3797     char *l = locinput;
3798     const char *e = PL_bostr + PL_regoffs[n].end;
3799     /*
3800     * Note that we can't do the "other character" lookup trick as
3801     * in the 8-bit case (no pun intended) because in Unicode we
3802     * have to map both upper and title case to lower case.
3803     */
3804     if (type == REFF) {
3805      while (s < e) {
3806       STRLEN ulen1, ulen2;
3807       U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3808       U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3809
3810       if (l >= PL_regeol)
3811        sayNO;
3812       toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3813       toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3814       if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3815        sayNO;
3816       s += ulen1;
3817       l += ulen2;
3818      }
3819     }
3820     locinput = l;
3821     nextchr = UCHARAT(locinput);
3822     break;
3823    }
3824
3825    /* Inline the first character, for speed. */
3826    if (UCHARAT(s) != nextchr &&
3827     (type == REF ||
3828     (UCHARAT(s) != (type == REFF
3829         ? PL_fold : PL_fold_locale)[nextchr])))
3830     sayNO;
3831    ln = PL_regoffs[n].end - ln;
3832    if (locinput + ln > PL_regeol)
3833     sayNO;
3834    if (ln > 1 && (type == REF
3835       ? memNE(s, locinput, ln)
3836       : (type == REFF
3837        ? ibcmp(s, locinput, ln)
3838        : ibcmp_locale(s, locinput, ln))))
3839     sayNO;
3840    locinput += ln;
3841    nextchr = UCHARAT(locinput);
3842    break;
3843   }
3844   case NOTHING:
3845   case TAIL:
3846    break;
3847   case BACK:
3848    break;
3849
3850 #undef  ST
3851 #define ST st->u.eval
3852   {
3853    SV *ret;
3854    REGEXP *re_sv;
3855    regexp *re;
3856    regexp_internal *rei;
3857    regnode *startpoint;
3858
3859   case GOSTART:
3860   case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3861    if (cur_eval && cur_eval->locinput==locinput) {
3862     if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3863      Perl_croak(aTHX_ "Infinite recursion in regex");
3864     if ( ++nochange_depth > max_nochange_depth )
3865      Perl_croak(aTHX_
3866       "Pattern subroutine nesting without pos change"
3867       " exceeded limit in regex");
3868    } else {
3869     nochange_depth = 0;
3870    }
3871    re_sv = rex_sv;
3872    re = rex;
3873    rei = rexi;
3874    (void)ReREFCNT_inc(rex_sv);
3875    if (OP(scan)==GOSUB) {
3876     startpoint = scan + ARG2L(scan);
3877     ST.close_paren = ARG(scan);
3878    } else {
3879     startpoint = rei->program+1;
3880     ST.close_paren = 0;
3881    }
3882    goto eval_recurse_doit;
3883    /* NOTREACHED */
3884   case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
3885    if (cur_eval && cur_eval->locinput==locinput) {
3886     if ( ++nochange_depth > max_nochange_depth )
3887      Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3888    } else {
3889     nochange_depth = 0;
3890    }
3891    {
3892     /* execute the code in the {...} */
3893     dSP;
3894     SV ** const before = SP;
3895     OP_4tree * const oop = PL_op;
3896     COP * const ocurcop = PL_curcop;
3897     PAD *old_comppad;
3898     char *saved_regeol = PL_regeol;
3899
3900     n = ARG(scan);
3901     PL_op = (OP_4tree*)rexi->data->data[n];
3902     DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3903      "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3904     PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3905     PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3906
3907     if (sv_yes_mark) {
3908      SV *sv_mrk = get_sv("REGMARK", 1);
3909      sv_setsv(sv_mrk, sv_yes_mark);
3910     }
3911
3912     CALLRUNOPS(aTHX);   /* Scalar context. */
3913     SPAGAIN;
3914     if (SP == before)
3915      ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
3916     else {
3917      ret = POPs;
3918      PUTBACK;
3919     }
3920
3921     PL_op = oop;
3922     PAD_RESTORE_LOCAL(old_comppad);
3923     PL_curcop = ocurcop;
3924     PL_regeol = saved_regeol;
3925     if (!logical) {
3926      /* /(?{...})/ */
3927      sv_setsv(save_scalar(PL_replgv), ret);
3928      break;
3929     }
3930    }
3931    if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3932     logical = 0;
3933     {
3934      /* extract RE object from returned value; compiling if
3935      * necessary */
3936      MAGIC *mg = NULL;
3937      REGEXP *rx = NULL;
3938
3939      if (SvROK(ret)) {
3940       SV *const sv = SvRV(ret);
3941
3942       if (SvTYPE(sv) == SVt_REGEXP) {
3943        rx = (REGEXP*) sv;
3944       } else if (SvSMAGICAL(sv)) {
3945        mg = mg_find(sv, PERL_MAGIC_qr);
3946        assert(mg);
3947       }
3948      } else if (SvTYPE(ret) == SVt_REGEXP) {
3949       rx = (REGEXP*) ret;
3950      } else if (SvSMAGICAL(ret)) {
3951       if (SvGMAGICAL(ret)) {
3952        /* I don't believe that there is ever qr magic
3953        here.  */
3954        assert(!mg_find(ret, PERL_MAGIC_qr));
3955        sv_unmagic(ret, PERL_MAGIC_qr);
3956       }
3957       else {
3958        mg = mg_find(ret, PERL_MAGIC_qr);
3959        /* testing suggests mg only ends up non-NULL for
3960        scalars who were upgraded and compiled in the
3961        else block below. In turn, this is only
3962        triggered in the "postponed utf8 string" tests
3963        in t/op/pat.t  */
3964       }
3965      }
3966
3967      if (mg) {
3968       rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
3969       assert(rx);
3970      }
3971      if (rx) {
3972       rx = reg_temp_copy(NULL, rx);
3973      }
3974      else {
3975       U32 pm_flags = 0;
3976       const I32 osize = PL_regsize;
3977
3978       if (DO_UTF8(ret)) {
3979        assert (SvUTF8(ret));
3980       } else if (SvUTF8(ret)) {
3981        /* Not doing UTF-8, despite what the SV says. Is
3982        this only if we're trapped in use 'bytes'?  */
3983        /* Make a copy of the octet sequence, but without
3984        the flag on, as the compiler now honours the
3985        SvUTF8 flag on ret.  */
3986        STRLEN len;
3987        const char *const p = SvPV(ret, len);
3988        ret = newSVpvn_flags(p, len, SVs_TEMP);
3989       }
3990       rx = CALLREGCOMP(ret, pm_flags);
3991       if (!(SvFLAGS(ret)
3992        & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3993         | SVs_GMG))) {
3994        /* This isn't a first class regexp. Instead, it's
3995        caching a regexp onto an existing, Perl visible
3996        scalar.  */
3997        sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3998       }
3999       PL_regsize = osize;
4000      }
4001      re_sv = rx;
4002      re = (struct regexp *)SvANY(rx);
4003     }
4004     RXp_MATCH_COPIED_off(re);
4005     re->subbeg = rex->subbeg;
4006     re->sublen = rex->sublen;
4007     rei = RXi_GET(re);
4008     DEBUG_EXECUTE_r(
4009      debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
4010       "Matching embedded");
4011     );
4012     startpoint = rei->program + 1;
4013      ST.close_paren = 0; /* only used for GOSUB */
4014      /* borrowed from regtry */
4015     if (PL_reg_start_tmpl <= re->nparens) {
4016      PL_reg_start_tmpl = re->nparens*3/2 + 3;
4017      if(PL_reg_start_tmp)
4018       Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4019      else
4020       Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4021     }
4022
4023   eval_recurse_doit: /* Share code with GOSUB below this line */
4024     /* run the pattern returned from (??{...}) */
4025     ST.cp = regcppush(0); /* Save *all* the positions. */
4026     REGCP_SET(ST.lastcp);
4027
4028     PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4029
4030     /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4031     PL_reglastparen = &re->lastparen;
4032     PL_reglastcloseparen = &re->lastcloseparen;
4033     re->lastparen = 0;
4034     re->lastcloseparen = 0;
4035
4036     PL_reginput = locinput;
4037     PL_regsize = 0;
4038
4039     /* XXXX This is too dramatic a measure... */
4040     PL_reg_maxiter = 0;
4041
4042     ST.toggle_reg_flags = PL_reg_flags;
4043     if (RX_UTF8(re_sv))
4044      PL_reg_flags |= RF_utf8;
4045     else
4046      PL_reg_flags &= ~RF_utf8;
4047     ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4048
4049     ST.prev_rex = rex_sv;
4050     ST.prev_curlyx = cur_curlyx;
4051     SETREX(rex_sv,re_sv);
4052     rex = re;
4053     rexi = rei;
4054     cur_curlyx = NULL;
4055     ST.B = next;
4056     ST.prev_eval = cur_eval;
4057     cur_eval = st;
4058     /* now continue from first node in postoned RE */
4059     PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4060     /* NOTREACHED */
4061    }
4062    /* logical is 1,   /(?(?{...})X|Y)/ */
4063    sw = (bool)SvTRUE(ret);
4064    logical = 0;
4065    break;
4066   }
4067
4068   case EVAL_AB: /* cleanup after a successful (??{A})B */
4069    /* note: this is called twice; first after popping B, then A */
4070    PL_reg_flags ^= ST.toggle_reg_flags;
4071    ReREFCNT_dec(rex_sv);
4072    SETREX(rex_sv,ST.prev_rex);
4073    rex = (struct regexp *)SvANY(rex_sv);
4074    rexi = RXi_GET(rex);
4075    regcpblow(ST.cp);
4076    cur_eval = ST.prev_eval;
4077    cur_curlyx = ST.prev_curlyx;
4078
4079    /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4080    PL_reglastparen = &rex->lastparen;
4081    PL_reglastcloseparen = &rex->lastcloseparen;
4082    /* also update PL_regoffs */
4083    PL_regoffs = rex->offs;
4084
4085    /* XXXX This is too dramatic a measure... */
4086    PL_reg_maxiter = 0;
4087    if ( nochange_depth )
4088     nochange_depth--;
4089    sayYES;
4090
4091
4092   case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4093    /* note: this is called twice; first after popping B, then A */
4094    PL_reg_flags ^= ST.toggle_reg_flags;
4095    ReREFCNT_dec(rex_sv);
4096    SETREX(rex_sv,ST.prev_rex);
4097    rex = (struct regexp *)SvANY(rex_sv);
4098    rexi = RXi_GET(rex);
4099    /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4100    PL_reglastparen = &rex->lastparen;
4101    PL_reglastcloseparen = &rex->lastcloseparen;
4102
4103    PL_reginput = locinput;
4104    REGCP_UNWIND(ST.lastcp);
4105    regcppop(rex);
4106    cur_eval = ST.prev_eval;
4107    cur_curlyx = ST.prev_curlyx;
4108    /* XXXX This is too dramatic a measure... */
4109    PL_reg_maxiter = 0;
4110    if ( nochange_depth )
4111     nochange_depth--;
4112    sayNO_SILENT;
4113 #undef ST
4114
4115   case OPEN:
4116    n = ARG(scan);  /* which paren pair */
4117    PL_reg_start_tmp[n] = locinput;
4118    if (n > PL_regsize)
4119     PL_regsize = n;
4120    lastopen = n;
4121    break;
4122   case CLOSE:
4123    n = ARG(scan);  /* which paren pair */
4124    PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4125    PL_regoffs[n].end = locinput - PL_bostr;
4126    /*if (n > PL_regsize)
4127     PL_regsize = n;*/
4128    if (n > *PL_reglastparen)
4129     *PL_reglastparen = n;
4130    *PL_reglastcloseparen = n;
4131    if (cur_eval && cur_eval->u.eval.close_paren == n) {
4132     goto fake_end;
4133    }
4134    break;
4135   case ACCEPT:
4136    if (ARG(scan)){
4137     regnode *cursor;
4138     for (cursor=scan;
4139      cursor && OP(cursor)!=END;
4140      cursor=regnext(cursor))
4141     {
4142      if ( OP(cursor)==CLOSE ){
4143       n = ARG(cursor);
4144       if ( n <= lastopen ) {
4145        PL_regoffs[n].start
4146         = PL_reg_start_tmp[n] - PL_bostr;
4147        PL_regoffs[n].end = locinput - PL_bostr;
4148        /*if (n > PL_regsize)
4149        PL_regsize = n;*/
4150        if (n > *PL_reglastparen)
4151         *PL_reglastparen = n;
4152        *PL_reglastcloseparen = n;
4153        if ( n == ARG(scan) || (cur_eval &&
4154         cur_eval->u.eval.close_paren == n))
4155         break;
4156       }
4157      }
4158     }
4159    }
4160    goto fake_end;
4161    /*NOTREACHED*/
4162   case GROUPP:
4163    n = ARG(scan);  /* which paren pair */
4164    sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4165    break;
4166   case NGROUPP:
4167    /* reg_check_named_buff_matched returns 0 for no match */
4168    sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
4169    break;
4170   case INSUBP:
4171    n = ARG(scan);
4172    sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4173    break;
4174   case DEFINEP:
4175    sw = 0;
4176    break;
4177   case IFTHEN:
4178    PL_reg_leftiter = PL_reg_maxiter;  /* Void cache */
4179    if (sw)
4180     next = NEXTOPER(NEXTOPER(scan));
4181    else {
4182     next = scan + ARG(scan);
4183     if (OP(next) == IFTHEN) /* Fake one. */
4184      next = NEXTOPER(NEXTOPER(next));
4185    }
4186    break;
4187   case LOGICAL:
4188    logical = scan->flags;
4189    break;
4190
4191 /*******************************************************************
4192
4193 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4194 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4195 STAR/PLUS/CURLY/CURLYN are used instead.)
4196
4197 A*B is compiled as <CURLYX><A><WHILEM><B>
4198
4199 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4200 state, which contains the current count, initialised to -1. It also sets
4201 cur_curlyx to point to this state, with any previous value saved in the
4202 state block.
4203
4204 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4205 since the pattern may possibly match zero times (i.e. it's a while {} loop
4206 rather than a do {} while loop).
4207
4208 Each entry to WHILEM represents a successful match of A. The count in the
4209 CURLYX block is incremented, another WHILEM state is pushed, and execution
4210 passes to A or B depending on greediness and the current count.
4211
4212 For example, if matching against the string a1a2a3b (where the aN are
4213 substrings that match /A/), then the match progresses as follows: (the
4214 pushed states are interspersed with the bits of strings matched so far):
4215
4216  <CURLYX cnt=-1>
4217  <CURLYX cnt=0><WHILEM>
4218  <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4219  <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4220  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4221  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4222
4223 (Contrast this with something like CURLYM, which maintains only a single
4224 backtrack state:
4225
4226  <CURLYM cnt=0> a1
4227  a1 <CURLYM cnt=1> a2
4228  a1 a2 <CURLYM cnt=2> a3
4229  a1 a2 a3 <CURLYM cnt=3> b
4230 )
4231
4232 Each WHILEM state block marks a point to backtrack to upon partial failure
4233 of A or B, and also contains some minor state data related to that
4234 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4235 overall state, such as the count, and pointers to the A and B ops.
4236
4237 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4238 must always point to the *current* CURLYX block, the rules are:
4239
4240 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4241 and set cur_curlyx to point the new block.
4242
4243 When popping the CURLYX block after a successful or unsuccessful match,
4244 restore the previous cur_curlyx.
4245
4246 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4247 to the outer one saved in the CURLYX block.
4248
4249 When popping the WHILEM block after a successful or unsuccessful B match,
4250 restore the previous cur_curlyx.
4251
4252 Here's an example for the pattern (AI* BI)*BO
4253 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4254
4255 cur_
4256 curlyx backtrack stack
4257 ------ ---------------
4258 NULL
4259 CO     <CO prev=NULL> <WO>
4260 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4261 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4262 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4263
4264 At this point the pattern succeeds, and we work back down the stack to
4265 clean up, restoring as we go:
4266
4267 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4268 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4269 CO     <CO prev=NULL> <WO>
4270 NULL
4271
4272 *******************************************************************/
4273
4274 #define ST st->u.curlyx
4275
4276   case CURLYX:    /* start of /A*B/  (for complex A) */
4277   {
4278    /* No need to save/restore up to this paren */
4279    I32 parenfloor = scan->flags;
4280
4281    assert(next); /* keep Coverity happy */
4282    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4283     next += ARG(next);
4284
4285    /* XXXX Probably it is better to teach regpush to support
4286    parenfloor > PL_regsize... */
4287    if (parenfloor > (I32)*PL_reglastparen)
4288     parenfloor = *PL_reglastparen; /* Pessimization... */
4289
4290    ST.prev_curlyx= cur_curlyx;
4291    cur_curlyx = st;
4292    ST.cp = PL_savestack_ix;
4293
4294    /* these fields contain the state of the current curly.
4295    * they are accessed by subsequent WHILEMs */
4296    ST.parenfloor = parenfloor;
4297    ST.min = ARG1(scan);
4298    ST.max = ARG2(scan);
4299    ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4300    ST.B = next;
4301    ST.minmod = minmod;
4302    minmod = 0;
4303    ST.count = -1; /* this will be updated by WHILEM */
4304    ST.lastloc = NULL;  /* this will be updated by WHILEM */
4305
4306    PL_reginput = locinput;
4307    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4308    /* NOTREACHED */
4309   }
4310
4311   case CURLYX_end: /* just finished matching all of A*B */
4312    cur_curlyx = ST.prev_curlyx;
4313    sayYES;
4314    /* NOTREACHED */
4315
4316   case CURLYX_end_fail: /* just failed to match all of A*B */
4317    regcpblow(ST.cp);
4318    cur_curlyx = ST.prev_curlyx;
4319    sayNO;
4320    /* NOTREACHED */
4321
4322
4323 #undef ST
4324 #define ST st->u.whilem
4325
4326   case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4327   {
4328    /* see the discussion above about CURLYX/WHILEM */
4329    I32 n;
4330    assert(cur_curlyx); /* keep Coverity happy */
4331    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4332    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4333    ST.cache_offset = 0;
4334    ST.cache_mask = 0;
4335
4336    PL_reginput = locinput;
4337
4338    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4339     "%*s  whilem: matched %ld out of %ld..%ld\n",
4340     REPORT_CODE_OFF+depth*2, "", (long)n,
4341     (long)cur_curlyx->u.curlyx.min,
4342     (long)cur_curlyx->u.curlyx.max)
4343    );
4344
4345    /* First just match a string of min A's. */
4346
4347    if (n < cur_curlyx->u.curlyx.min) {
4348     cur_curlyx->u.curlyx.lastloc = locinput;
4349     PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4350     /* NOTREACHED */
4351    }
4352
4353    /* If degenerate A matches "", assume A done. */
4354
4355    if (locinput == cur_curlyx->u.curlyx.lastloc) {
4356     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4357     "%*s  whilem: empty match detected, trying continuation...\n",
4358     REPORT_CODE_OFF+depth*2, "")
4359     );
4360     goto do_whilem_B_max;
4361    }
4362
4363    /* super-linear cache processing */
4364
4365    if (scan->flags) {
4366
4367     if (!PL_reg_maxiter) {
4368      /* start the countdown: Postpone detection until we
4369      * know the match is not *that* much linear. */
4370      PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4371      /* possible overflow for long strings and many CURLYX's */
4372      if (PL_reg_maxiter < 0)
4373       PL_reg_maxiter = I32_MAX;
4374      PL_reg_leftiter = PL_reg_maxiter;
4375     }
4376
4377     if (PL_reg_leftiter-- == 0) {
4378      /* initialise cache */
4379      const I32 size = (PL_reg_maxiter + 7)/8;
4380      if (PL_reg_poscache) {
4381       if ((I32)PL_reg_poscache_size < size) {
4382        Renew(PL_reg_poscache, size, char);
4383        PL_reg_poscache_size = size;
4384       }
4385       Zero(PL_reg_poscache, size, char);
4386      }
4387      else {
4388       PL_reg_poscache_size = size;
4389       Newxz(PL_reg_poscache, size, char);
4390      }
4391      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4392  "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4393        PL_colors[4], PL_colors[5])
4394      );
4395     }
4396
4397     if (PL_reg_leftiter < 0) {
4398      /* have we already failed at this position? */
4399      I32 offset, mask;
4400      offset  = (scan->flags & 0xf) - 1
4401         + (locinput - PL_bostr)  * (scan->flags>>4);
4402      mask    = 1 << (offset % 8);
4403      offset /= 8;
4404      if (PL_reg_poscache[offset] & mask) {
4405       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4406        "%*s  whilem: (cache) already tried at this position...\n",
4407        REPORT_CODE_OFF+depth*2, "")
4408       );
4409       sayNO; /* cache records failure */
4410      }
4411      ST.cache_offset = offset;
4412      ST.cache_mask   = mask;
4413     }
4414    }
4415
4416    /* Prefer B over A for minimal matching. */
4417
4418    if (cur_curlyx->u.curlyx.minmod) {
4419     ST.save_curlyx = cur_curlyx;
4420     cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4421     ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4422     REGCP_SET(ST.lastcp);
4423     PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4424     /* NOTREACHED */
4425    }
4426
4427    /* Prefer A over B for maximal matching. */
4428
4429    if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4430     ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4431     cur_curlyx->u.curlyx.lastloc = locinput;
4432     REGCP_SET(ST.lastcp);
4433     PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4434     /* NOTREACHED */
4435    }
4436    goto do_whilem_B_max;
4437   }
4438   /* NOTREACHED */
4439
4440   case WHILEM_B_min: /* just matched B in a minimal match */
4441   case WHILEM_B_max: /* just matched B in a maximal match */
4442    cur_curlyx = ST.save_curlyx;
4443    sayYES;
4444    /* NOTREACHED */
4445
4446   case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4447    cur_curlyx = ST.save_curlyx;
4448    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4449    cur_curlyx->u.curlyx.count--;
4450    CACHEsayNO;
4451    /* NOTREACHED */
4452
4453   case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4454    REGCP_UNWIND(ST.lastcp);
4455    regcppop(rex);
4456    /* FALL THROUGH */
4457   case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4458    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4459    cur_curlyx->u.curlyx.count--;
4460    CACHEsayNO;
4461    /* NOTREACHED */
4462
4463   case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4464    REGCP_UNWIND(ST.lastcp);
4465    regcppop(rex); /* Restore some previous $<digit>s? */
4466    PL_reginput = locinput;
4467    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4468     "%*s  whilem: failed, trying continuation...\n",
4469     REPORT_CODE_OFF+depth*2, "")
4470    );
4471   do_whilem_B_max:
4472    if (cur_curlyx->u.curlyx.count >= REG_INFTY
4473     && ckWARN(WARN_REGEXP)
4474     && !(PL_reg_flags & RF_warned))
4475    {
4476     PL_reg_flags |= RF_warned;
4477     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4478      "Complex regular subexpression recursion",
4479      REG_INFTY - 1);
4480    }
4481
4482    /* now try B */
4483    ST.save_curlyx = cur_curlyx;
4484    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4485    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4486    /* NOTREACHED */
4487
4488   case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4489    cur_curlyx = ST.save_curlyx;
4490    REGCP_UNWIND(ST.lastcp);
4491    regcppop(rex);
4492
4493    if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4494     /* Maximum greed exceeded */
4495     if (cur_curlyx->u.curlyx.count >= REG_INFTY
4496      && ckWARN(WARN_REGEXP)
4497      && !(PL_reg_flags & RF_warned))
4498     {
4499      PL_reg_flags |= RF_warned;
4500      Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4501       "%s limit (%d) exceeded",
4502       "Complex regular subexpression recursion",
4503       REG_INFTY - 1);
4504     }
4505     cur_curlyx->u.curlyx.count--;
4506     CACHEsayNO;
4507    }
4508
4509    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4510     "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4511    );
4512    /* Try grabbing another A and see if it helps. */
4513    PL_reginput = locinput;
4514    cur_curlyx->u.curlyx.lastloc = locinput;
4515    ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4516    REGCP_SET(ST.lastcp);
4517    PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4518    /* NOTREACHED */
4519
4520 #undef  ST
4521 #define ST st->u.branch
4522
4523   case BRANCHJ:     /*  /(...|A|...)/ with long next pointer */
4524    next = scan + ARG(scan);
4525    if (next == scan)
4526     next = NULL;
4527    scan = NEXTOPER(scan);
4528    /* FALL THROUGH */
4529
4530   case BRANCH:     /*  /(...|A|...)/ */
4531    scan = NEXTOPER(scan); /* scan now points to inner node */
4532    ST.lastparen = *PL_reglastparen;
4533    ST.next_branch = next;
4534    REGCP_SET(ST.cp);
4535    PL_reginput = locinput;
4536
4537    /* Now go into the branch */
4538    if (has_cutgroup) {
4539     PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4540    } else {
4541     PUSH_STATE_GOTO(BRANCH_next, scan);
4542    }
4543    /* NOTREACHED */
4544   case CUTGROUP:
4545    PL_reginput = locinput;
4546    sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4547     MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4548    PUSH_STATE_GOTO(CUTGROUP_next,next);
4549    /* NOTREACHED */
4550   case CUTGROUP_next_fail:
4551    do_cutgroup = 1;
4552    no_final = 1;
4553    if (st->u.mark.mark_name)
4554     sv_commit = st->u.mark.mark_name;
4555    sayNO;
4556    /* NOTREACHED */
4557   case BRANCH_next:
4558    sayYES;
4559    /* NOTREACHED */
4560   case BRANCH_next_fail: /* that branch failed; try the next, if any */
4561    if (do_cutgroup) {
4562     do_cutgroup = 0;
4563     no_final = 0;
4564    }
4565    REGCP_UNWIND(ST.cp);
4566    for (n = *PL_reglastparen; n > ST.lastparen; n--)
4567     PL_regoffs[n].end = -1;
4568    *PL_reglastparen = n;
4569    /*dmq: *PL_reglastcloseparen = n; */
4570    scan = ST.next_branch;
4571    /* no more branches? */
4572    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4573     DEBUG_EXECUTE_r({
4574      PerlIO_printf( Perl_debug_log,
4575       "%*s  %sBRANCH failed...%s\n",
4576       REPORT_CODE_OFF+depth*2, "",
4577       PL_colors[4],
4578       PL_colors[5] );
4579     });
4580     sayNO_SILENT;
4581    }
4582    continue; /* execute next BRANCH[J] op */
4583    /* NOTREACHED */
4584
4585   case MINMOD:
4586    minmod = 1;
4587    break;
4588
4589 #undef  ST
4590 #define ST st->u.curlym
4591
4592   case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4593
4594    /* This is an optimisation of CURLYX that enables us to push
4595    * only a single backtracking state, no matter how many matches
4596    * there are in {m,n}. It relies on the pattern being constant
4597    * length, with no parens to influence future backrefs
4598    */
4599
4600    ST.me = scan;
4601    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4602
4603    /* if paren positive, emulate an OPEN/CLOSE around A */
4604    if (ST.me->flags) {
4605     U32 paren = ST.me->flags;
4606     if (paren > PL_regsize)
4607      PL_regsize = paren;
4608     if (paren > *PL_reglastparen)
4609      *PL_reglastparen = paren;
4610     scan += NEXT_OFF(scan); /* Skip former OPEN. */
4611    }
4612    ST.A = scan;
4613    ST.B = next;
4614    ST.alen = 0;
4615    ST.count = 0;
4616    ST.minmod = minmod;
4617    minmod = 0;
4618    ST.c1 = CHRTEST_UNINIT;
4619    REGCP_SET(ST.cp);
4620
4621    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4622     goto curlym_do_B;
4623
4624   curlym_do_A: /* execute the A in /A{m,n}B/  */
4625    PL_reginput = locinput;
4626    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4627    /* NOTREACHED */
4628
4629   case CURLYM_A: /* we've just matched an A */
4630    locinput = st->locinput;
4631    nextchr = UCHARAT(locinput);
4632
4633    ST.count++;
4634    /* after first match, determine A's length: u.curlym.alen */
4635    if (ST.count == 1) {
4636     if (PL_reg_match_utf8) {
4637      char *s = locinput;
4638      while (s < PL_reginput) {
4639       ST.alen++;
4640       s += UTF8SKIP(s);
4641      }
4642     }
4643     else {
4644      ST.alen = PL_reginput - locinput;
4645     }
4646     if (ST.alen == 0)
4647      ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4648    }
4649    DEBUG_EXECUTE_r(
4650     PerlIO_printf(Perl_debug_log,
4651       "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4652       (int)(REPORT_CODE_OFF+(depth*2)), "",
4653       (IV) ST.count, (IV)ST.alen)
4654    );
4655
4656    locinput = PL_reginput;
4657
4658    if (cur_eval && cur_eval->u.eval.close_paren &&
4659     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4660     goto fake_end;
4661
4662    {
4663     I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4664     if ( max == REG_INFTY || ST.count < max )
4665      goto curlym_do_A; /* try to match another A */
4666    }
4667    goto curlym_do_B; /* try to match B */
4668
4669   case CURLYM_A_fail: /* just failed to match an A */
4670    REGCP_UNWIND(ST.cp);
4671
4672    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4673     || (cur_eval && cur_eval->u.eval.close_paren &&
4674      cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4675     sayNO;
4676
4677   curlym_do_B: /* execute the B in /A{m,n}B/  */
4678    PL_reginput = locinput;
4679    if (ST.c1 == CHRTEST_UNINIT) {
4680     /* calculate c1 and c2 for possible match of 1st char
4681     * following curly */
4682     ST.c1 = ST.c2 = CHRTEST_VOID;
4683     if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4684      regnode *text_node = ST.B;
4685      if (! HAS_TEXT(text_node))
4686       FIND_NEXT_IMPT(text_node);
4687      /* this used to be
4688
4689       (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4690
4691        But the former is redundant in light of the latter.
4692
4693        if this changes back then the macro for
4694        IS_TEXT and friends need to change.
4695      */
4696      if (PL_regkind[OP(text_node)] == EXACT)
4697      {
4698
4699       ST.c1 = (U8)*STRING(text_node);
4700       ST.c2 =
4701        (IS_TEXTF(text_node))
4702        ? PL_fold[ST.c1]
4703        : (IS_TEXTFL(text_node))
4704         ? PL_fold_locale[ST.c1]
4705         : ST.c1;
4706      }
4707     }
4708    }
4709
4710    DEBUG_EXECUTE_r(
4711     PerlIO_printf(Perl_debug_log,
4712      "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4713      (int)(REPORT_CODE_OFF+(depth*2)),
4714      "", (IV)ST.count)
4715     );
4716    if (ST.c1 != CHRTEST_VOID
4717      && UCHARAT(PL_reginput) != ST.c1
4718      && UCHARAT(PL_reginput) != ST.c2)
4719    {
4720     /* simulate B failing */
4721     DEBUG_OPTIMISE_r(
4722      PerlIO_printf(Perl_debug_log,
4723       "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4724       (int)(REPORT_CODE_OFF+(depth*2)),"",
4725       (IV)ST.c1,(IV)ST.c2
4726     ));
4727     state_num = CURLYM_B_fail;
4728     goto reenter_switch;
4729    }
4730
4731    if (ST.me->flags) {
4732     /* mark current A as captured */
4733     I32 paren = ST.me->flags;
4734     if (ST.count) {
4735      PL_regoffs[paren].start
4736       = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4737      PL_regoffs[paren].end = PL_reginput - PL_bostr;
4738      /*dmq: *PL_reglastcloseparen = paren; */
4739     }
4740     else
4741      PL_regoffs[paren].end = -1;
4742     if (cur_eval && cur_eval->u.eval.close_paren &&
4743      cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4744     {
4745      if (ST.count)
4746       goto fake_end;
4747      else
4748       sayNO;
4749     }
4750    }
4751
4752    PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4753    /* NOTREACHED */
4754
4755   case CURLYM_B_fail: /* just failed to match a B */
4756    REGCP_UNWIND(ST.cp);
4757    if (ST.minmod) {
4758     I32 max = ARG2(ST.me);
4759     if (max != REG_INFTY && ST.count == max)
4760      sayNO;
4761     goto curlym_do_A; /* try to match a further A */
4762    }
4763    /* backtrack one A */
4764    if (ST.count == ARG1(ST.me) /* min */)
4765     sayNO;
4766    ST.count--;
4767    locinput = HOPc(locinput, -ST.alen);
4768    goto curlym_do_B; /* try to match B */
4769
4770 #undef ST
4771 #define ST st->u.curly
4772
4773 #define CURLY_SETPAREN(paren, success) \
4774  if (paren) { \
4775   if (success) { \
4776    PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4777    PL_regoffs[paren].end = locinput - PL_bostr; \
4778    *PL_reglastcloseparen = paren; \
4779   } \
4780   else \
4781    PL_regoffs[paren].end = -1; \
4782  }
4783
4784   case STAR:  /*  /A*B/ where A is width 1 */
4785    ST.paren = 0;
4786    ST.min = 0;
4787    ST.max = REG_INFTY;
4788    scan = NEXTOPER(scan);
4789    goto repeat;
4790   case PLUS:  /*  /A+B/ where A is width 1 */
4791    ST.paren = 0;
4792    ST.min = 1;
4793    ST.max = REG_INFTY;
4794    scan = NEXTOPER(scan);
4795    goto repeat;
4796   case CURLYN:  /*  /(A){m,n}B/ where A is width 1 */
4797    ST.paren = scan->flags; /* Which paren to set */
4798    if (ST.paren > PL_regsize)
4799     PL_regsize = ST.paren;
4800    if (ST.paren > *PL_reglastparen)
4801     *PL_reglastparen = ST.paren;
4802    ST.min = ARG1(scan);  /* min to match */
4803    ST.max = ARG2(scan);  /* max to match */
4804    if (cur_eval && cur_eval->u.eval.close_paren &&
4805     cur_eval->u.eval.close_paren == (U32)ST.paren) {
4806     ST.min=1;
4807     ST.max=1;
4808    }
4809    scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4810    goto repeat;
4811   case CURLY:  /*  /A{m,n}B/ where A is width 1 */
4812    ST.paren = 0;
4813    ST.min = ARG1(scan);  /* min to match */
4814    ST.max = ARG2(scan);  /* max to match */
4815    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4816   repeat:
4817    /*
4818    * Lookahead to avoid useless match attempts
4819    * when we know what character comes next.
4820    *
4821    * Used to only do .*x and .*?x, but now it allows
4822    * for )'s, ('s and (?{ ... })'s to be in the way
4823    * of the quantifier and the EXACT-like node.  -- japhy
4824    */
4825
4826    if (ST.min > ST.max) /* XXX make this a compile-time check? */
4827     sayNO;
4828    if (HAS_TEXT(next) || JUMPABLE(next)) {
4829     U8 *s;
4830     regnode *text_node = next;
4831
4832     if (! HAS_TEXT(text_node))
4833      FIND_NEXT_IMPT(text_node);
4834
4835     if (! HAS_TEXT(text_node))
4836      ST.c1 = ST.c2 = CHRTEST_VOID;
4837     else {
4838      if ( PL_regkind[OP(text_node)] != EXACT ) {
4839       ST.c1 = ST.c2 = CHRTEST_VOID;
4840       goto assume_ok_easy;
4841      }
4842      else
4843       s = (U8*)STRING(text_node);
4844
4845      /*  Currently we only get here when
4846
4847       PL_rekind[OP(text_node)] == EXACT
4848
4849       if this changes back then the macro for IS_TEXT and
4850       friends need to change. */
4851      if (!UTF) {
4852       ST.c2 = ST.c1 = *s;
4853       if (IS_TEXTF(text_node))
4854        ST.c2 = PL_fold[ST.c1];
4855       else if (IS_TEXTFL(text_node))
4856        ST.c2 = PL_fold_locale[ST.c1];
4857      }
4858      else { /* UTF */
4859       if (IS_TEXTF(text_node)) {
4860        STRLEN ulen1, ulen2;
4861        U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4862        U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4863
4864        to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4865        to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4866 #ifdef EBCDIC
4867        ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4868              ckWARN(WARN_UTF8) ?
4869              0 : UTF8_ALLOW_ANY);
4870        ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4871              ckWARN(WARN_UTF8) ?
4872              0 : UTF8_ALLOW_ANY);
4873 #else
4874        ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4875              uniflags);
4876        ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4877              uniflags);
4878 #endif
4879       }
4880       else {
4881        ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4882              uniflags);
4883       }
4884      }
4885     }
4886    }
4887    else
4888     ST.c1 = ST.c2 = CHRTEST_VOID;
4889   assume_ok_easy:
4890
4891    ST.A = scan;
4892    ST.B = next;
4893    PL_reginput = locinput;
4894    if (minmod) {
4895     minmod = 0;
4896     if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4897      sayNO;
4898     ST.count = ST.min;
4899     locinput = PL_reginput;
4900     REGCP_SET(ST.cp);
4901     if (ST.c1 == CHRTEST_VOID)
4902      goto curly_try_B_min;
4903
4904     ST.oldloc = locinput;
4905
4906     /* set ST.maxpos to the furthest point along the
4907     * string that could possibly match */
4908     if  (ST.max == REG_INFTY) {
4909      ST.maxpos = PL_regeol - 1;
4910      if (do_utf8)
4911       while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4912        ST.maxpos--;
4913     }
4914     else if (do_utf8) {
4915      int m = ST.max - ST.min;
4916      for (ST.maxpos = locinput;
4917       m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4918       ST.maxpos += UTF8SKIP(ST.maxpos);
4919     }
4920     else {
4921      ST.maxpos = locinput + ST.max - ST.min;
4922      if (ST.maxpos >= PL_regeol)
4923       ST.maxpos = PL_regeol - 1;
4924     }
4925     goto curly_try_B_min_known;
4926
4927    }
4928    else {
4929     ST.count = regrepeat(rex, ST.A, ST.max, depth);
4930     locinput = PL_reginput;
4931     if (ST.count < ST.min)
4932      sayNO;
4933     if ((ST.count > ST.min)
4934      && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4935     {
4936      /* A{m,n} must come at the end of the string, there's
4937      * no point in backing off ... */
4938      ST.min = ST.count;
4939      /* ...except that $ and \Z can match before *and* after
4940      newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
4941      We may back off by one in this case. */
4942      if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4943       ST.min--;
4944     }
4945     REGCP_SET(ST.cp);
4946     goto curly_try_B_max;
4947    }
4948    /* NOTREACHED */
4949
4950
4951   case CURLY_B_min_known_fail:
4952    /* failed to find B in a non-greedy match where c1,c2 valid */
4953    if (ST.paren && ST.count)
4954     PL_regoffs[ST.paren].end = -1;
4955
4956    PL_reginput = locinput; /* Could be reset... */
4957    REGCP_UNWIND(ST.cp);
4958    /* Couldn't or didn't -- move forward. */
4959    ST.oldloc = locinput;
4960    if (do_utf8)
4961     locinput += UTF8SKIP(locinput);
4962    else
4963     locinput++;
4964    ST.count++;
4965   curly_try_B_min_known:
4966    /* find the next place where 'B' could work, then call B */
4967    {
4968     int n;
4969     if (do_utf8) {
4970      n = (ST.oldloc == locinput) ? 0 : 1;
4971      if (ST.c1 == ST.c2) {
4972       STRLEN len;
4973       /* set n to utf8_distance(oldloc, locinput) */
4974       while (locinput <= ST.maxpos &&
4975        utf8n_to_uvchr((U8*)locinput,
4976            UTF8_MAXBYTES, &len,
4977            uniflags) != (UV)ST.c1) {
4978        locinput += len;
4979        n++;
4980       }
4981      }
4982      else {
4983       /* set n to utf8_distance(oldloc, locinput) */
4984       while (locinput <= ST.maxpos) {
4985        STRLEN len;
4986        const UV c = utf8n_to_uvchr((U8*)locinput,
4987             UTF8_MAXBYTES, &len,
4988             uniflags);
4989        if (c == (UV)ST.c1 || c == (UV)ST.c2)
4990         break;
4991        locinput += len;
4992        n++;
4993       }
4994      }
4995     }
4996     else {
4997      if (ST.c1 == ST.c2) {
4998       while (locinput <= ST.maxpos &&
4999        UCHARAT(locinput) != ST.c1)
5000        locinput++;
5001      }
5002      else {
5003       while (locinput <= ST.maxpos
5004        && UCHARAT(locinput) != ST.c1
5005        && UCHARAT(locinput) != ST.c2)
5006        locinput++;
5007      }
5008      n = locinput - ST.oldloc;
5009     }
5010     if (locinput > ST.maxpos)
5011      sayNO;
5012     /* PL_reginput == oldloc now */
5013     if (n) {
5014      ST.count += n;
5015      if (regrepeat(rex, ST.A, n, depth) < n)
5016       sayNO;
5017     }
5018     PL_reginput = locinput;
5019     CURLY_SETPAREN(ST.paren, ST.count);
5020     if (cur_eval && cur_eval->u.eval.close_paren &&
5021      cur_eval->u.eval.close_paren == (U32)ST.paren) {
5022      goto fake_end;
5023     }
5024     PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5025    }
5026    /* NOTREACHED */
5027
5028
5029   case CURLY_B_min_fail:
5030    /* failed to find B in a non-greedy match where c1,c2 invalid */
5031    if (ST.paren && ST.count)
5032     PL_regoffs[ST.paren].end = -1;
5033
5034    REGCP_UNWIND(ST.cp);
5035    /* failed -- move forward one */
5036    PL_reginput = locinput;
5037    if (regrepeat(rex, ST.A, 1, depth)) {
5038     ST.count++;
5039     locinput = PL_reginput;
5040     if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5041       ST.count > 0)) /* count overflow ? */
5042     {
5043     curly_try_B_min:
5044      CURLY_SETPAREN(ST.paren, ST.count);
5045      if (cur_eval && cur_eval->u.eval.close_paren &&
5046       cur_eval->u.eval.close_paren == (U32)ST.paren) {
5047       goto fake_end;
5048      }
5049      PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5050     }
5051    }
5052    sayNO;
5053    /* NOTREACHED */
5054
5055
5056   curly_try_B_max:
5057    /* a successful greedy match: now try to match B */
5058    if (cur_eval && cur_eval->u.eval.close_paren &&
5059     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5060     goto fake_end;
5061    }
5062    {
5063     UV c = 0;
5064     if (ST.c1 != CHRTEST_VOID)
5065      c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
5066           UTF8_MAXBYTES, 0, uniflags)
5067         : (UV) UCHARAT(PL_reginput);
5068     /* If it could work, try it. */
5069     if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5070      CURLY_SETPAREN(ST.paren, ST.count);
5071      PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5072      /* NOTREACHED */
5073     }
5074    }
5075    /* FALL THROUGH */
5076   case CURLY_B_max_fail:
5077    /* failed to find B in a greedy match */
5078    if (ST.paren && ST.count)
5079     PL_regoffs[ST.paren].end = -1;
5080
5081    REGCP_UNWIND(ST.cp);
5082    /*  back up. */
5083    if (--ST.count < ST.min)
5084     sayNO;
5085    PL_reginput = locinput = HOPc(locinput, -1);
5086    goto curly_try_B_max;
5087
5088 #undef ST
5089
5090   case END:
5091    fake_end:
5092    if (cur_eval) {
5093     /* we've just finished A in /(??{A})B/; now continue with B */
5094     I32 tmpix;
5095     st->u.eval.toggle_reg_flags
5096        = cur_eval->u.eval.toggle_reg_flags;
5097     PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5098
5099     st->u.eval.prev_rex = rex_sv;  /* inner */
5100     SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5101     rex = (struct regexp *)SvANY(rex_sv);
5102     rexi = RXi_GET(rex);
5103     cur_curlyx = cur_eval->u.eval.prev_curlyx;
5104     ReREFCNT_inc(rex_sv);
5105     st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
5106
5107     /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5108     PL_reglastparen = &rex->lastparen;
5109     PL_reglastcloseparen = &rex->lastcloseparen;
5110
5111     REGCP_SET(st->u.eval.lastcp);
5112     PL_reginput = locinput;
5113
5114     /* Restore parens of the outer rex without popping the
5115     * savestack */
5116     tmpix = PL_savestack_ix;
5117     PL_savestack_ix = cur_eval->u.eval.lastcp;
5118     regcppop(rex);
5119     PL_savestack_ix = tmpix;
5120
5121     st->u.eval.prev_eval = cur_eval;
5122     cur_eval = cur_eval->u.eval.prev_eval;
5123     DEBUG_EXECUTE_r(
5124      PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5125          REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5126     if ( nochange_depth )
5127      nochange_depth--;
5128
5129     PUSH_YES_STATE_GOTO(EVAL_AB,
5130       st->u.eval.prev_eval->u.eval.B); /* match B */
5131    }
5132
5133    if (locinput < reginfo->till) {
5134     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5135          "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5136          PL_colors[4],
5137          (long)(locinput - PL_reg_starttry),
5138          (long)(reginfo->till - PL_reg_starttry),
5139          PL_colors[5]));
5140
5141     sayNO_SILENT;  /* Cannot match: too short. */
5142    }
5143    PL_reginput = locinput; /* put where regtry can find it */
5144    sayYES;   /* Success! */
5145
5146   case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5147    DEBUG_EXECUTE_r(
5148    PerlIO_printf(Perl_debug_log,
5149     "%*s  %ssubpattern success...%s\n",
5150     REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5151    PL_reginput = locinput; /* put where regtry can find it */
5152    sayYES;   /* Success! */
5153
5154 #undef  ST
5155 #define ST st->u.ifmatch
5156
5157   case SUSPEND: /* (?>A) */
5158    ST.wanted = 1;
5159    PL_reginput = locinput;
5160    goto do_ifmatch;
5161
5162   case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5163    ST.wanted = 0;
5164    goto ifmatch_trivial_fail_test;
5165
5166   case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5167    ST.wanted = 1;
5168   ifmatch_trivial_fail_test:
5169    if (scan->flags) {
5170     char * const s = HOPBACKc(locinput, scan->flags);
5171     if (!s) {
5172      /* trivial fail */
5173      if (logical) {
5174       logical = 0;
5175       sw = 1 - (bool)ST.wanted;
5176      }
5177      else if (ST.wanted)
5178       sayNO;
5179      next = scan + ARG(scan);
5180      if (next == scan)
5181       next = NULL;
5182      break;
5183     }
5184     PL_reginput = s;
5185    }
5186    else
5187     PL_reginput = locinput;
5188
5189   do_ifmatch:
5190    ST.me = scan;
5191    ST.logical = logical;
5192    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5193
5194    /* execute body of (?...A) */
5195    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5196    /* NOTREACHED */
5197
5198   case IFMATCH_A_fail: /* body of (?...A) failed */
5199    ST.wanted = !ST.wanted;
5200    /* FALL THROUGH */
5201
5202   case IFMATCH_A: /* body of (?...A) succeeded */
5203    if (ST.logical) {
5204     sw = (bool)ST.wanted;
5205    }
5206    else if (!ST.wanted)
5207     sayNO;
5208
5209    if (OP(ST.me) == SUSPEND)
5210     locinput = PL_reginput;
5211    else {
5212     locinput = PL_reginput = st->locinput;
5213     nextchr = UCHARAT(locinput);
5214    }
5215    scan = ST.me + ARG(ST.me);
5216    if (scan == ST.me)
5217     scan = NULL;
5218    continue; /* execute B */
5219
5220 #undef ST
5221
5222   case LONGJMP:
5223    next = scan + ARG(scan);
5224    if (next == scan)
5225     next = NULL;
5226    break;
5227   case COMMIT:
5228    reginfo->cutpoint = PL_regeol;
5229    /* FALLTHROUGH */
5230   case PRUNE:
5231    PL_reginput = locinput;
5232    if (!scan->flags)
5233     sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5234    PUSH_STATE_GOTO(COMMIT_next,next);
5235    /* NOTREACHED */
5236   case COMMIT_next_fail:
5237    no_final = 1;
5238    /* FALLTHROUGH */
5239   case OPFAIL:
5240    sayNO;
5241    /* NOTREACHED */
5242
5243 #define ST st->u.mark
5244   case MARKPOINT:
5245    ST.prev_mark = mark_state;
5246    ST.mark_name = sv_commit = sv_yes_mark
5247     = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5248    mark_state = st;
5249    ST.mark_loc = PL_reginput = locinput;
5250    PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5251    /* NOTREACHED */
5252   case MARKPOINT_next:
5253    mark_state = ST.prev_mark;
5254    sayYES;
5255    /* NOTREACHED */
5256   case MARKPOINT_next_fail:
5257    if (popmark && sv_eq(ST.mark_name,popmark))
5258    {
5259     if (ST.mark_loc > startpoint)
5260      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5261     popmark = NULL; /* we found our mark */
5262     sv_commit = ST.mark_name;
5263
5264     DEBUG_EXECUTE_r({
5265       PerlIO_printf(Perl_debug_log,
5266        "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5267        REPORT_CODE_OFF+depth*2, "",
5268        PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5269     });
5270    }
5271    mark_state = ST.prev_mark;
5272    sv_yes_mark = mark_state ?
5273     mark_state->u.mark.mark_name : NULL;
5274    sayNO;
5275    /* NOTREACHED */
5276   case SKIP:
5277    PL_reginput = locinput;
5278    if (scan->flags) {
5279     /* (*SKIP) : if we fail we cut here*/
5280     ST.mark_name = NULL;
5281     ST.mark_loc = locinput;
5282     PUSH_STATE_GOTO(SKIP_next,next);
5283    } else {
5284     /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5285     otherwise do nothing.  Meaning we need to scan
5286     */
5287     regmatch_state *cur = mark_state;
5288     SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5289
5290     while (cur) {
5291      if ( sv_eq( cur->u.mark.mark_name,
5292         find ) )
5293      {
5294       ST.mark_name = find;
5295       PUSH_STATE_GOTO( SKIP_next, next );
5296      }
5297      cur = cur->u.mark.prev_mark;
5298     }
5299    }
5300    /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5301    break;
5302   case SKIP_next_fail:
5303    if (ST.mark_name) {
5304     /* (*CUT:NAME) - Set up to search for the name as we
5305     collapse the stack*/
5306     popmark = ST.mark_name;
5307    } else {
5308     /* (*CUT) - No name, we cut here.*/
5309     if (ST.mark_loc > startpoint)
5310      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5311     /* but we set sv_commit to latest mark_name if there
5312     is one so they can test to see how things lead to this
5313     cut */
5314     if (mark_state)
5315      sv_commit=mark_state->u.mark.mark_name;
5316    }
5317    no_final = 1;
5318    sayNO;
5319    /* NOTREACHED */
5320 #undef ST
5321   case FOLDCHAR:
5322    n = ARG(scan);
5323    if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5324     locinput += ln;
5325    } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5326     sayNO;
5327    } else  {
5328     U8 folded[UTF8_MAXBYTES_CASE+1];
5329     STRLEN foldlen;
5330     const char * const l = locinput;
5331     char *e = PL_regeol;
5332     to_uni_fold(n, folded, &foldlen);
5333
5334     if (ibcmp_utf8((const char*) folded, 0,  foldlen, 1,
5335        l, &e, 0,  do_utf8)) {
5336       sayNO;
5337     }
5338     locinput = e;
5339    }
5340    nextchr = UCHARAT(locinput);
5341    break;
5342   case LNBREAK:
5343    if ((n=is_LNBREAK(locinput,do_utf8))) {
5344     locinput += n;
5345     nextchr = UCHARAT(locinput);
5346    } else
5347     sayNO;
5348    break;
5349
5350 #define CASE_CLASS(nAmE)                              \
5351   case nAmE:                                    \
5352    if ((n=is_##nAmE(locinput,do_utf8))) {    \
5353     locinput += n;                        \
5354     nextchr = UCHARAT(locinput);          \
5355    } else                                    \
5356     sayNO;                                \
5357    break;                                    \
5358   case N##nAmE:                                 \
5359    if ((n=is_##nAmE(locinput,do_utf8))) {    \
5360     sayNO;                                \
5361    } else {                                  \
5362     locinput += UTF8SKIP(locinput);       \
5363     nextchr = UCHARAT(locinput);          \
5364    }                                         \
5365    break
5366
5367   CASE_CLASS(VERTWS);
5368   CASE_CLASS(HORIZWS);
5369 #undef CASE_CLASS
5370
5371   default:
5372    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5373       PTR2UV(scan), OP(scan));
5374    Perl_croak(aTHX_ "regexp memory corruption");
5375
5376   } /* end switch */
5377
5378   /* switch break jumps here */
5379   scan = next; /* prepare to execute the next op and ... */
5380   continue;    /* ... jump back to the top, reusing st */
5381   /* NOTREACHED */
5382
5383  push_yes_state:
5384   /* push a state that backtracks on success */
5385   st->u.yes.prev_yes_state = yes_state;
5386   yes_state = st;
5387   /* FALL THROUGH */
5388  push_state:
5389   /* push a new regex state, then continue at scan  */
5390   {
5391    regmatch_state *newst;
5392
5393    DEBUG_STACK_r({
5394     regmatch_state *cur = st;
5395     regmatch_state *curyes = yes_state;
5396     int curd = depth;
5397     regmatch_slab *slab = PL_regmatch_slab;
5398     for (;curd > -1;cur--,curd--) {
5399      if (cur < SLAB_FIRST(slab)) {
5400       slab = slab->prev;
5401       cur = SLAB_LAST(slab);
5402      }
5403      PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5404       REPORT_CODE_OFF + 2 + depth * 2,"",
5405       curd, PL_reg_name[cur->resume_state],
5406       (curyes == cur) ? "yes" : ""
5407      );
5408      if (curyes == cur)
5409       curyes = cur->u.yes.prev_yes_state;
5410     }
5411    } else
5412     DEBUG_STATE_pp("push")
5413    );
5414    depth++;
5415    st->locinput = locinput;
5416    newst = st+1;
5417    if (newst >  SLAB_LAST(PL_regmatch_slab))
5418     newst = S_push_slab(aTHX);
5419    PL_regmatch_state = newst;
5420
5421    locinput = PL_reginput;
5422    nextchr = UCHARAT(locinput);
5423    st = newst;
5424    continue;
5425    /* NOTREACHED */
5426   }
5427  }
5428
5429  /*
5430  * We get here only if there's trouble -- normally "case END" is
5431  * the terminating point.
5432  */
5433  Perl_croak(aTHX_ "corrupted regexp pointers");
5434  /*NOTREACHED*/
5435  sayNO;
5436
5437 yes:
5438  if (yes_state) {
5439   /* we have successfully completed a subexpression, but we must now
5440   * pop to the state marked by yes_state and continue from there */
5441   assert(st != yes_state);
5442 #ifdef DEBUGGING
5443   while (st != yes_state) {
5444    st--;
5445    if (st < SLAB_FIRST(PL_regmatch_slab)) {
5446     PL_regmatch_slab = PL_regmatch_slab->prev;
5447     st = SLAB_LAST(PL_regmatch_slab);
5448    }
5449    DEBUG_STATE_r({
5450     if (no_final) {
5451      DEBUG_STATE_pp("pop (no final)");
5452     } else {
5453      DEBUG_STATE_pp("pop (yes)");
5454     }
5455    });
5456    depth--;
5457   }
5458 #else
5459   while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5460    || yes_state > SLAB_LAST(PL_regmatch_slab))
5461   {
5462    /* not in this slab, pop slab */
5463    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5464    PL_regmatch_slab = PL_regmatch_slab->prev;
5465    st = SLAB_LAST(PL_regmatch_slab);
5466   }
5467   depth -= (st - yes_state);
5468 #endif
5469   st = yes_state;
5470   yes_state = st->u.yes.prev_yes_state;
5471   PL_regmatch_state = st;
5472
5473   if (no_final) {
5474    locinput= st->locinput;
5475    nextchr = UCHARAT(locinput);
5476   }
5477   state_num = st->resume_state + no_final;
5478   goto reenter_switch;
5479  }
5480
5481  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5482       PL_colors[4], PL_colors[5]));
5483
5484  if (PL_reg_eval_set) {
5485   /* each successfully executed (?{...}) block does the equivalent of
5486   *   local $^R = do {...}
5487   * When popping the save stack, all these locals would be undone;
5488   * bypass this by setting the outermost saved $^R to the latest
5489   * value */
5490   if (oreplsv != GvSV(PL_replgv))
5491    sv_setsv(oreplsv, GvSV(PL_replgv));
5492  }
5493  result = 1;
5494  goto final_exit;
5495
5496 no:
5497  DEBUG_EXECUTE_r(
5498   PerlIO_printf(Perl_debug_log,
5499    "%*s  %sfailed...%s\n",
5500    REPORT_CODE_OFF+depth*2, "",
5501    PL_colors[4], PL_colors[5])
5502   );
5503
5504 no_silent:
5505  if (no_final) {
5506   if (yes_state) {
5507    goto yes;
5508   } else {
5509    goto final_exit;
5510   }
5511  }
5512  if (depth) {
5513   /* there's a previous state to backtrack to */
5514   st--;
5515   if (st < SLAB_FIRST(PL_regmatch_slab)) {
5516    PL_regmatch_slab = PL_regmatch_slab->prev;
5517    st = SLAB_LAST(PL_regmatch_slab);
5518   }
5519   PL_regmatch_state = st;
5520   locinput= st->locinput;
5521   nextchr = UCHARAT(locinput);
5522
5523   DEBUG_STATE_pp("pop");
5524   depth--;
5525   if (yes_state == st)
5526    yes_state = st->u.yes.prev_yes_state;
5527
5528   state_num = st->resume_state + 1; /* failure = success + 1 */
5529   goto reenter_switch;
5530  }
5531  result = 0;
5532
5533   final_exit:
5534  if (rex->intflags & PREGf_VERBARG_SEEN) {
5535   SV *sv_err = get_sv("REGERROR", 1);
5536   SV *sv_mrk = get_sv("REGMARK", 1);
5537   if (result) {
5538    sv_commit = &PL_sv_no;
5539    if (!sv_yes_mark)
5540     sv_yes_mark = &PL_sv_yes;
5541   } else {
5542    if (!sv_commit)
5543     sv_commit = &PL_sv_yes;
5544    sv_yes_mark = &PL_sv_no;
5545   }
5546   sv_setsv(sv_err, sv_commit);
5547   sv_setsv(sv_mrk, sv_yes_mark);
5548  }
5549
5550  /* clean up; in particular, free all slabs above current one */
5551  LEAVE_SCOPE(oldsave);
5552
5553  return result;
5554 }
5555
5556 /*
5557  - regrepeat - repeatedly match something simple, report how many
5558  */
5559 /*
5560  * [This routine now assumes that it will only match on things of length 1.
5561  * That was true before, but now we assume scan - reginput is the count,
5562  * rather than incrementing count on every character.  [Er, except utf8.]]
5563  */
5564 STATIC I32
5565 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5566 {
5567  dVAR;
5568  register char *scan;
5569  register I32 c;
5570  register char *loceol = PL_regeol;
5571  register I32 hardcount = 0;
5572  register bool do_utf8 = PL_reg_match_utf8;
5573 #ifndef DEBUGGING
5574  PERL_UNUSED_ARG(depth);
5575 #endif
5576
5577  PERL_ARGS_ASSERT_REGREPEAT;
5578
5579  scan = PL_reginput;
5580  if (max == REG_INFTY)
5581   max = I32_MAX;
5582  else if (max < loceol - scan)
5583   loceol = scan + max;
5584  switch (OP(p)) {
5585  case REG_ANY:
5586   if (do_utf8) {
5587    loceol = PL_regeol;
5588    while (scan < loceol && hardcount < max && *scan != '\n') {
5589     scan += UTF8SKIP(scan);
5590     hardcount++;
5591    }
5592   } else {
5593    while (scan < loceol && *scan != '\n')
5594     scan++;
5595   }
5596   break;
5597  case SANY:
5598   if (do_utf8) {
5599    loceol = PL_regeol;
5600    while (scan < loceol && hardcount < max) {
5601     scan += UTF8SKIP(scan);
5602     hardcount++;
5603    }
5604   }
5605   else
5606    scan = loceol;
5607   break;
5608  case CANY:
5609   scan = loceol;
5610   break;
5611  case EXACT:  /* length of string is 1 */
5612   c = (U8)*STRING(p);
5613   while (scan < loceol && UCHARAT(scan) == c)
5614    scan++;
5615   break;
5616  case EXACTF: /* length of string is 1 */
5617   c = (U8)*STRING(p);
5618   while (scan < loceol &&
5619    (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5620    scan++;
5621   break;
5622  case EXACTFL: /* length of string is 1 */
5623   PL_reg_flags |= RF_tainted;
5624   c = (U8)*STRING(p);
5625   while (scan < loceol &&
5626    (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5627    scan++;
5628   break;
5629  case ANYOF:
5630   if (do_utf8) {
5631    loceol = PL_regeol;
5632    while (hardcount < max && scan < loceol &&
5633     reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5634     scan += UTF8SKIP(scan);
5635     hardcount++;
5636    }
5637   } else {
5638    while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5639     scan++;
5640   }
5641   break;
5642  case ALNUM:
5643   if (do_utf8) {
5644    loceol = PL_regeol;
5645    LOAD_UTF8_CHARCLASS_ALNUM();
5646    while (hardcount < max && scan < loceol &&
5647     swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5648     scan += UTF8SKIP(scan);
5649     hardcount++;
5650    }
5651   } else {
5652    while (scan < loceol && isALNUM(*scan))
5653     scan++;
5654   }
5655   break;
5656  case ALNUML:
5657   PL_reg_flags |= RF_tainted;
5658   if (do_utf8) {
5659    loceol = PL_regeol;
5660    while (hardcount < max && scan < loceol &&
5661     isALNUM_LC_utf8((U8*)scan)) {
5662     scan += UTF8SKIP(scan);
5663     hardcount++;
5664    }
5665   } else {
5666    while (scan < loceol && isALNUM_LC(*scan))
5667     scan++;
5668   }
5669   break;
5670  case NALNUM:
5671   if (do_utf8) {
5672    loceol = PL_regeol;
5673    LOAD_UTF8_CHARCLASS_ALNUM();
5674    while (hardcount < max && scan < loceol &&
5675     !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5676     scan += UTF8SKIP(scan);
5677     hardcount++;
5678    }
5679   } else {
5680    while (scan < loceol && !isALNUM(*scan))
5681     scan++;
5682   }
5683   break;
5684  case NALNUML:
5685   PL_reg_flags |= RF_tainted;
5686   if (do_utf8) {
5687    loceol = PL_regeol;
5688    while (hardcount < max && scan < loceol &&
5689     !isALNUM_LC_utf8((U8*)scan)) {
5690     scan += UTF8SKIP(scan);
5691     hardcount++;
5692    }
5693   } else {
5694    while (scan < loceol && !isALNUM_LC(*scan))
5695     scan++;
5696   }
5697   break;
5698  case SPACE:
5699   if (do_utf8) {
5700    loceol = PL_regeol;
5701    LOAD_UTF8_CHARCLASS_SPACE();
5702    while (hardcount < max && scan < loceol &&
5703     (*scan == ' ' ||
5704      swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5705     scan += UTF8SKIP(scan);
5706     hardcount++;
5707    }
5708   } else {
5709    while (scan < loceol && isSPACE(*scan))
5710     scan++;
5711   }
5712   break;
5713  case SPACEL:
5714   PL_reg_flags |= RF_tainted;
5715   if (do_utf8) {
5716    loceol = PL_regeol;
5717    while (hardcount < max && scan < loceol &&
5718     (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5719     scan += UTF8SKIP(scan);
5720     hardcount++;
5721    }
5722   } else {
5723    while (scan < loceol && isSPACE_LC(*scan))
5724     scan++;
5725   }
5726   break;
5727  case NSPACE:
5728   if (do_utf8) {
5729    loceol = PL_regeol;
5730    LOAD_UTF8_CHARCLASS_SPACE();
5731    while (hardcount < max && scan < loceol &&
5732     !(*scan == ' ' ||
5733      swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5734     scan += UTF8SKIP(scan);
5735     hardcount++;
5736    }
5737   } else {
5738    while (scan < loceol && !isSPACE(*scan))
5739     scan++;
5740   }
5741   break;
5742  case NSPACEL:
5743   PL_reg_flags |= RF_tainted;
5744   if (do_utf8) {
5745    loceol = PL_regeol;
5746    while (hardcount < max && scan < loceol &&
5747     !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5748     scan += UTF8SKIP(scan);
5749     hardcount++;
5750    }
5751   } else {
5752    while (scan < loceol && !isSPACE_LC(*scan))
5753     scan++;
5754   }
5755   break;
5756  case DIGIT:
5757   if (do_utf8) {
5758    loceol = PL_regeol;
5759    LOAD_UTF8_CHARCLASS_DIGIT();
5760    while (hardcount < max && scan < loceol &&
5761     swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5762     scan += UTF8SKIP(scan);
5763     hardcount++;
5764    }
5765   } else {
5766    while (scan < loceol && isDIGIT(*scan))
5767     scan++;
5768   }
5769   break;
5770  case NDIGIT:
5771   if (do_utf8) {
5772    loceol = PL_regeol;
5773    LOAD_UTF8_CHARCLASS_DIGIT();
5774    while (hardcount < max && scan < loceol &&
5775     !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5776     scan += UTF8SKIP(scan);
5777     hardcount++;
5778    }
5779   } else {
5780    while (scan < loceol && !isDIGIT(*scan))
5781     scan++;
5782   }
5783  case LNBREAK:
5784   if (do_utf8) {
5785    loceol = PL_regeol;
5786    while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5787     scan += c;
5788     hardcount++;
5789    }
5790   } else {
5791    /*
5792    LNBREAK can match two latin chars, which is ok,
5793    because we have a null terminated string, but we
5794    have to use hardcount in this situation
5795    */
5796    while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
5797     scan+=c;
5798     hardcount++;
5799    }
5800   }
5801   break;
5802  case HORIZWS:
5803   if (do_utf8) {
5804    loceol = PL_regeol;
5805    while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5806     scan += c;
5807     hardcount++;
5808    }
5809   } else {
5810    while (scan < loceol && is_HORIZWS_latin1(scan))
5811     scan++;
5812   }
5813   break;
5814  case NHORIZWS:
5815   if (do_utf8) {
5816    loceol = PL_regeol;
5817    while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5818     scan += UTF8SKIP(scan);
5819     hardcount++;
5820    }
5821   } else {
5822    while (scan < loceol && !is_HORIZWS_latin1(scan))
5823     scan++;
5824
5825   }
5826   break;
5827  case VERTWS:
5828   if (do_utf8) {
5829    loceol = PL_regeol;
5830    while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5831     scan += c;
5832     hardcount++;
5833    }
5834   } else {
5835    while (scan < loceol && is_VERTWS_latin1(scan))
5836     scan++;
5837
5838   }
5839   break;
5840  case NVERTWS:
5841   if (do_utf8) {
5842    loceol = PL_regeol;
5843    while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5844     scan += UTF8SKIP(scan);
5845     hardcount++;
5846    }
5847   } else {
5848    while (scan < loceol && !is_VERTWS_latin1(scan))
5849     scan++;
5850
5851   }
5852   break;
5853
5854  default:  /* Called on something of 0 width. */
5855   break;  /* So match right here or not at all. */
5856  }
5857
5858  if (hardcount)
5859   c = hardcount;
5860  else
5861   c = scan - PL_reginput;
5862  PL_reginput = scan;
5863
5864  DEBUG_r({
5865   GET_RE_DEBUG_FLAGS_DECL;
5866   DEBUG_EXECUTE_r({
5867    SV * const prop = sv_newmortal();
5868    regprop(prog, prop, p);
5869    PerlIO_printf(Perl_debug_log,
5870       "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
5871       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5872   });
5873  });
5874
5875  return(c);
5876 }
5877
5878
5879 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5880 /*
5881 - regclass_swash - prepare the utf8 swash
5882 */
5883
5884 SV *
5885 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5886 {
5887  dVAR;
5888  SV *sw  = NULL;
5889  SV *si  = NULL;
5890  SV *alt = NULL;
5891  RXi_GET_DECL(prog,progi);
5892  const struct reg_data * const data = prog ? progi->data : NULL;
5893
5894  PERL_ARGS_ASSERT_REGCLASS_SWASH;
5895
5896  if (data && data->count) {
5897   const U32 n = ARG(node);
5898
5899   if (data->what[n] == 's') {
5900    SV * const rv = MUTABLE_SV(data->data[n]);
5901    AV * const av = MUTABLE_AV(SvRV(rv));
5902    SV **const ary = AvARRAY(av);
5903    SV **a, **b;
5904
5905    /* See the end of regcomp.c:S_regclass() for
5906    * documentation of these array elements. */
5907
5908    si = *ary;
5909    a  = SvROK(ary[1]) ? &ary[1] : NULL;
5910    b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5911
5912    if (a)
5913     sw = *a;
5914    else if (si && doinit) {
5915     sw = swash_init("utf8", "", si, 1, 0);
5916     (void)av_store(av, 1, sw);
5917    }
5918    if (b)
5919     alt = *b;
5920   }
5921  }
5922
5923  if (listsvp)
5924   *listsvp = si;
5925  if (altsvp)
5926   *altsvp  = alt;
5927
5928  return sw;
5929 }
5930 #endif
5931
5932 /*
5933  - reginclass - determine if a character falls into a character class
5934
5935   The n is the ANYOF regnode, the p is the target string, lenp
5936   is pointer to the maximum length of how far to go in the p
5937   (if the lenp is zero, UTF8SKIP(p) is used),
5938   do_utf8 tells whether the target string is in UTF-8.
5939
5940  */
5941
5942 STATIC bool
5943 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5944 {
5945  dVAR;
5946  const char flags = ANYOF_FLAGS(n);
5947  bool match = FALSE;
5948  UV c = *p;
5949  STRLEN len = 0;
5950  STRLEN plen;
5951
5952  PERL_ARGS_ASSERT_REGINCLASS;
5953
5954  if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5955   c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5956     (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
5957     | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
5958     /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
5959     * UTF8_ALLOW_FFFF */
5960   if (len == (STRLEN)-1)
5961    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5962  }
5963
5964  plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5965  if (do_utf8 || (flags & ANYOF_UNICODE)) {
5966   if (lenp)
5967    *lenp = 0;
5968   if (do_utf8 && !ANYOF_RUNTIME(n)) {
5969    if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5970     match = TRUE;
5971   }
5972   if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5973    match = TRUE;
5974   if (!match) {
5975    AV *av;
5976    SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5977
5978    if (sw) {
5979     U8 * utf8_p;
5980     if (do_utf8) {
5981      utf8_p = (U8 *) p;
5982     } else {
5983      STRLEN len = 1;
5984      utf8_p = bytes_to_utf8(p, &len);
5985     }
5986     if (swash_fetch(sw, utf8_p, 1))
5987      match = TRUE;
5988     else if (flags & ANYOF_FOLD) {
5989      if (!match && lenp && av) {
5990       I32 i;
5991       for (i = 0; i <= av_len(av); i++) {
5992        SV* const sv = *av_fetch(av, i, FALSE);
5993        STRLEN len;
5994        const char * const s = SvPV_const(sv, len);
5995        if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
5996         *lenp = len;
5997         match = TRUE;
5998         break;
5999        }
6000       }
6001      }
6002      if (!match) {
6003       U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
6004
6005       STRLEN tmplen;
6006       to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6007       if (swash_fetch(sw, tmpbuf, 1))
6008        match = TRUE;
6009      }
6010     }
6011
6012     /* If we allocated a string above, free it */
6013     if (! do_utf8) Safefree(utf8_p);
6014    }
6015   }
6016   if (match && lenp && *lenp == 0)
6017    *lenp = UNISKIP(NATIVE_TO_UNI(c));
6018  }
6019  if (!match && c < 256) {
6020   if (ANYOF_BITMAP_TEST(n, c))
6021    match = TRUE;
6022   else if (flags & ANYOF_FOLD) {
6023    U8 f;
6024
6025    if (flags & ANYOF_LOCALE) {
6026     PL_reg_flags |= RF_tainted;
6027     f = PL_fold_locale[c];
6028    }
6029    else
6030     f = PL_fold[c];
6031    if (f != c && ANYOF_BITMAP_TEST(n, f))
6032     match = TRUE;
6033   }
6034
6035   if (!match && (flags & ANYOF_CLASS)) {
6036    PL_reg_flags |= RF_tainted;
6037    if (
6038     (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6039     (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6040     (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6041     (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6042     (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6043     (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6044     (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6045     (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6046     (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6047     (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6048     (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6049     (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6050     (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6051     (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6052     (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6053     (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6054     (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6055     (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6056     (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6057     (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6058     (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6059     (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6060     (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6061     (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6062     (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6063     (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6064     (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6065     (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6066     (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6067     (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6068     ) /* How's that for a conditional? */
6069    {
6070     match = TRUE;
6071    }
6072   }
6073  }
6074
6075  return (flags & ANYOF_INVERT) ? !match : match;
6076 }
6077
6078 STATIC U8 *
6079 S_reghop3(U8 *s, I32 off, const U8* lim)
6080 {
6081  dVAR;
6082
6083  PERL_ARGS_ASSERT_REGHOP3;
6084
6085  if (off >= 0) {
6086   while (off-- && s < lim) {
6087    /* XXX could check well-formedness here */
6088    s += UTF8SKIP(s);
6089   }
6090  }
6091  else {
6092   while (off++ && s > lim) {
6093    s--;
6094    if (UTF8_IS_CONTINUED(*s)) {
6095     while (s > lim && UTF8_IS_CONTINUATION(*s))
6096      s--;
6097    }
6098    /* XXX could check well-formedness here */
6099   }
6100  }
6101  return s;
6102 }
6103
6104 #ifdef XXX_dmq
6105 /* there are a bunch of places where we use two reghop3's that should
6106    be replaced with this routine. but since thats not done yet
6107    we ifdef it out - dmq
6108 */
6109 STATIC U8 *
6110 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6111 {
6112  dVAR;
6113
6114  PERL_ARGS_ASSERT_REGHOP4;
6115
6116  if (off >= 0) {
6117   while (off-- && s < rlim) {
6118    /* XXX could check well-formedness here */
6119    s += UTF8SKIP(s);
6120   }
6121  }
6122  else {
6123   while (off++ && s > llim) {
6124    s--;
6125    if (UTF8_IS_CONTINUED(*s)) {
6126     while (s > llim && UTF8_IS_CONTINUATION(*s))
6127      s--;
6128    }
6129    /* XXX could check well-formedness here */
6130   }
6131  }
6132  return s;
6133 }
6134 #endif
6135
6136 STATIC U8 *
6137 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6138 {
6139  dVAR;
6140
6141  PERL_ARGS_ASSERT_REGHOPMAYBE3;
6142
6143  if (off >= 0) {
6144   while (off-- && s < lim) {
6145    /* XXX could check well-formedness here */
6146    s += UTF8SKIP(s);
6147   }
6148   if (off >= 0)
6149    return NULL;
6150  }
6151  else {
6152   while (off++ && s > lim) {
6153    s--;
6154    if (UTF8_IS_CONTINUED(*s)) {
6155     while (s > lim && UTF8_IS_CONTINUATION(*s))
6156      s--;
6157    }
6158    /* XXX could check well-formedness here */
6159   }
6160   if (off <= 0)
6161    return NULL;
6162  }
6163  return s;
6164 }
6165
6166 static void
6167 restore_pos(pTHX_ void *arg)
6168 {
6169  dVAR;
6170  regexp * const rex = (regexp *)arg;
6171  if (PL_reg_eval_set) {
6172   if (PL_reg_oldsaved) {
6173    rex->subbeg = PL_reg_oldsaved;
6174    rex->sublen = PL_reg_oldsavedlen;
6175 #ifdef PERL_OLD_COPY_ON_WRITE
6176    rex->saved_copy = PL_nrs;
6177 #endif
6178    RXp_MATCH_COPIED_on(rex);
6179   }
6180   PL_reg_magic->mg_len = PL_reg_oldpos;
6181   PL_reg_eval_set = 0;
6182   PL_curpm = PL_reg_oldcurpm;
6183  }
6184 }
6185
6186 STATIC void
6187 S_to_utf8_substr(pTHX_ register regexp *prog)
6188 {
6189  int i = 1;
6190
6191  PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6192
6193  do {
6194   if (prog->substrs->data[i].substr
6195    && !prog->substrs->data[i].utf8_substr) {
6196    SV* const sv = newSVsv(prog->substrs->data[i].substr);
6197    prog->substrs->data[i].utf8_substr = sv;
6198    sv_utf8_upgrade(sv);
6199    if (SvVALID(prog->substrs->data[i].substr)) {
6200     const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6201     if (flags & FBMcf_TAIL) {
6202      /* Trim the trailing \n that fbm_compile added last
6203      time.  */
6204      SvCUR_set(sv, SvCUR(sv) - 1);
6205      /* Whilst this makes the SV technically "invalid" (as its
6206      buffer is no longer followed by "\0") when fbm_compile()
6207      adds the "\n" back, a "\0" is restored.  */
6208     }
6209     fbm_compile(sv, flags);
6210    }
6211    if (prog->substrs->data[i].substr == prog->check_substr)
6212     prog->check_utf8 = sv;
6213   }
6214  } while (i--);
6215 }
6216
6217 STATIC void
6218 S_to_byte_substr(pTHX_ register regexp *prog)
6219 {
6220  dVAR;
6221  int i = 1;
6222
6223  PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6224
6225  do {
6226   if (prog->substrs->data[i].utf8_substr
6227    && !prog->substrs->data[i].substr) {
6228    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6229    if (sv_utf8_downgrade(sv, TRUE)) {
6230     if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6231      const U8 flags
6232       = BmFLAGS(prog->substrs->data[i].utf8_substr);
6233      if (flags & FBMcf_TAIL) {
6234       /* Trim the trailing \n that fbm_compile added last
6235       time.  */
6236       SvCUR_set(sv, SvCUR(sv) - 1);
6237      }
6238      fbm_compile(sv, flags);
6239     }
6240    } else {
6241     SvREFCNT_dec(sv);
6242     sv = &PL_sv_undef;
6243    }
6244    prog->substrs->data[i].substr = sv;
6245    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6246     prog->check_substr = sv;
6247   }
6248  } while (i--);
6249 }
6250
6251 /*
6252  * Local variables:
6253  * c-indentation-style: bsd
6254  * c-basic-offset: 4
6255  * indent-tabs-mode: t
6256  * End:
6257  *
6258  * ex: set ts=8 sts=4 sw=4 noet:
6259  */