src/5012005/regexec.c

   1 /*    regexec.c
   2  */
   3
   4 /*
   5  *  One Ring to rule them all, One Ring to find them
   6  &
   7  *     [p.v of _The Lord of the Rings_, opening poem]
   8  *     [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
   9  *     [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
  10  */
  11
  12 /* This file contains functions for executing a regular expression.  See
  13  * also regcomp.c which funnily enough, contains functions for compiling
  14  * a regular expression.
  15  *
  16  * This file is also copied at build time to ext/re/re_exec.c, where
  17  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
  18  * This causes the main functions to be compiled under new names and with
  19  * debugging support added, which makes "use re 'debug'" work.
  20  */
  21
  22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
  23  * confused with the original package (see point 3 below).  Thanks, Henry!
  24  */
  25
  26 /* Additional note: this code is very heavily munged from Henry's version
  27  * in places.  In some spots I've traded clarity for efficiency, so don't
  28  * blame Henry for some of the lack of readability.
  29  */
  30
  31 /* The names of the functions have been changed from regcomp and
  32  * regexec to  pregcomp and pregexec in order to avoid conflicts
  33  * with the POSIX routines of the same names.
  34 */
  35
  36 #ifdef PERL_EXT_RE_BUILD
  37 #include "re_top.h"
  38 #endif
  39
  40 /*
  41  * pregcomp and pregexec -- regsub and regerror are not used in perl
  42  *
  43  * Copyright (c) 1986 by University of Toronto.
  44  * Written by Henry Spencer.  Not derived from licensed software.
  45  *
  46  * Permission is granted to anyone to use this software for any
  47  * purpose on any computer system, and to redistribute it freely,
  48  * subject to the following restrictions:
  49  *
  50  * 1. The author is not responsible for the consequences of use of
  51  *  this software, no matter how awful, even if they arise
  52  *  from defects in it.
  53  *
  54  * 2. The origin of this software must not be misrepresented, either
  55  *  by explicit claim or by omission.
  56  *
  57  * 3. Altered versions must be plainly marked as such, and must not
  58  *  be misrepresented as being the original software.
  59  *
  60  ****    Alterations to Henry's code are...
  61  ****
  62  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
  63  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
  64  ****    by Larry Wall and others
  65  ****
  66  ****    You may distribute under the terms of either the GNU General Public
  67  ****    License or the Artistic License, as specified in the README file.
  68  *
  69  * Beware that some of this code is subtly aware of the way operator
  70  * precedence is structured in regular expressions.  Serious changes in
  71  * regular-expression syntax might require a total rethink.
  72  */
  73 #include "EXTERN.h"
  74 #define PERL_IN_REGEXEC_C
  75 #undef PERL_IN_XSUB_RE
  76 #define PERL_IN_XSUB_RE 1
  77 #include "perl.h"
  78 #include "re_defs.h"
  79 #undef PERL_IN_XSUB_RE
  80
  81 #ifdef PERL_IN_XSUB_RE
  82 #  include "re_comp.h"
  83 #else
  84 #  include "regcomp.h"
  85 #endif
  86
  87 #define RF_tainted 1  /* tainted information used? */
  88 #define RF_warned 2  /* warned about big count? */
  89
  90 #define RF_utf8  8  /* Pattern contains multibyte chars? */
  91
  92 #define UTF ((PL_reg_flags & RF_utf8) != 0)
  93
  94 #define RS_init  1  /* eval environment created */
  95 #define RS_set  2  /* replsv value is set */
  96
  97 #ifndef STATIC
  98 #define STATIC static
  99 #endif
 100
 101 #define REGINCLASS(prog,p,c)  (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
 102
 103 /*
 104  * Forwards.
 105  */
 106
 107 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
 108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
 109
 110 #define HOPc(pos,off) \
 111   (char *)(PL_reg_match_utf8 \
 112    ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
 113    : (U8*)(pos + off))
 114 #define HOPBACKc(pos, off) \
 115   (char*)(PL_reg_match_utf8\
 116    ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
 117    : (pos - off >= PL_bostr)  \
 118     ? (U8*)pos - off  \
 119     : NULL)
 120
 121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
 122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
 123
 124 /* these are unrolled below in the CCC_TRY_XXX defined */
 125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
 126  if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
 127
 128 /* Doesn't do an assert to verify that is correct */
 129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
 130  if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
 131
 132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
 133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
 134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
 135
 136 #define LOAD_UTF8_CHARCLASS_GCB()  /* Grapheme cluster boundaries */        \
 137   LOAD_UTF8_CHARCLASS(X_begin, " ");                                  \
 138   LOAD_UTF8_CHARCLASS(X_non_hangul, "A");                             \
 139   /* These are utf8 constants, and not utf-ebcdic constants, so the   \
 140    * assert should likely and hopefully fail on an EBCDIC machine */ \
 141   LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */             \
 142                    \
 143   /* No asserts are done for these, in case called on an early        \
 144    * Unicode version in which they map to nothing */               \
 145   LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
 146   LOAD_UTF8_CHARCLASS_NO_CHECK(X_L);     /* U+1100 "\xe1\x84\x80" */ \
 147   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV);     /* U+AC00 "\xea\xb0\x80" */ \
 148   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT);    /* U+AC01 "\xea\xb0\x81" */ \
 149   LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
 150   LOAD_UTF8_CHARCLASS_NO_CHECK(X_T);      /* U+11A8 "\xe1\x86\xa8" */ \
 151   LOAD_UTF8_CHARCLASS_NO_CHECK(X_V)       /* U+1160 "\xe1\x85\xa0" */
 152
 153 /*
 154    We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
 155    so that it is possible to override the option here without having to
 156    rebuild the entire core. as we are required to do if we change regcomp.h
 157    which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
 158 */
 159 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
 160 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
 161 #endif
 162
 163 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
 164 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS_ALNUM()
 165 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS_SPACE()
 166 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
 167 #define RE_utf8_perl_word   PL_utf8_alnum
 168 #define RE_utf8_perl_space  PL_utf8_space
 169 #define RE_utf8_posix_digit PL_utf8_digit
 170 #define perl_word  alnum
 171 #define perl_space space
 172 #define posix_digit digit
 173 #else
 174 #define LOAD_UTF8_CHARCLASS_PERL_WORD()   LOAD_UTF8_CHARCLASS(perl_word,"a")
 175 #define LOAD_UTF8_CHARCLASS_PERL_SPACE()  LOAD_UTF8_CHARCLASS(perl_space," ")
 176 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
 177 #define RE_utf8_perl_word   PL_utf8_perl_word
 178 #define RE_utf8_perl_space  PL_utf8_perl_space
 179 #define RE_utf8_posix_digit PL_utf8_posix_digit
 180 #endif
 181
 182
 183 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                          \
 184   case NAMEL:                                                              \
 185    PL_reg_flags |= RF_tainted;                                                 \
 186    /* FALL THROUGH */                                                          \
 187   case NAME:                                                                     \
 188    if (!nextchr)                                                               \
 189     sayNO;                                                                  \
 190    if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 191     if (!CAT2(PL_utf8_,CLASS)) {                                            \
 192      bool ok;                                                            \
 193      ENTER;                                                              \
 194      save_re_context();                                                  \
 195      ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 196      assert(ok);                                                         \
 197      LEAVE;                                                              \
 198     }                                                                       \
 199     if (!(OP(scan) == NAME                                                  \
 200      ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)   \
 201      : LCFUNC_utf8((U8*)locinput)))                                      \
 202     {                                                                       \
 203      sayNO;                                                              \
 204     }                                                                       \
 205     locinput += PL_utf8skip[nextchr];                                       \
 206     nextchr = UCHARAT(locinput);                                            \
 207     break;                                                                  \
 208    }                                                                           \
 209    if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                  \
 210     sayNO;                                                                  \
 211    nextchr = UCHARAT(++locinput);                                              \
 212    break
 213
 214 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC)                        \
 215   case NAMEL:                                                              \
 216    PL_reg_flags |= RF_tainted;                                                 \
 217    /* FALL THROUGH */                                                          \
 218   case NAME :                                                                     \
 219    if (!nextchr && locinput >= PL_regeol)                                      \
 220     sayNO;                                                                  \
 221    if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) {                                \
 222     if (!CAT2(PL_utf8_,CLASS)) {                                            \
 223      bool ok;                                                            \
 224      ENTER;                                                              \
 225      save_re_context();                                                  \
 226      ok=CAT2(is_utf8_,CLASS)((const U8*)STR);                            \
 227      assert(ok);                                                         \
 228      LEAVE;                                                              \
 229     }                                                                       \
 230     if ((OP(scan) == NAME                                                  \
 231      ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)    \
 232      : LCFUNC_utf8((U8*)locinput)))                                      \
 233     {                                                                       \
 234      sayNO;                                                              \
 235     }                                                                       \
 236     locinput += PL_utf8skip[nextchr];                                       \
 237     nextchr = UCHARAT(locinput);                                            \
 238     break;                                                                  \
 239    }                                                                           \
 240    if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr)))                   \
 241     sayNO;                                                                  \
 242    nextchr = UCHARAT(++locinput);                                              \
 243    break
 244
 245
 246
 247
 248
 249 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
 250
 251 /* for use after a quantifier and before an EXACT-like node -- japhy */
 252 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
 253 #define JUMPABLE(rn) (      \
 254  OP(rn) == OPEN ||       \
 255  (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
 256  OP(rn) == EVAL ||   \
 257  OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
 258  OP(rn) == PLUS || OP(rn) == MINMOD || \
 259  OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
 260  (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
 261 )
 262 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
 263
 264 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
 265
 266 #if 0
 267 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
 268    we don't need this definition. */
 269 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
 270 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  || OP(rn)==REFF  || OP(rn)==NREFF  )
 271 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 272
 273 #else
 274 /* ... so we use this as its faster. */
 275 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
 276 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 277 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 278
 279 #endif
 280
 281 /*
 282   Search for mandatory following text node; for lookahead, the text must
 283   follow but for lookbehind (rn->flags != 0) we skip to the next step.
 284 */
 285 #define FIND_NEXT_IMPT(rn) STMT_START { \
 286  while (JUMPABLE(rn)) { \
 287   const OPCODE type = OP(rn); \
 288   if (type == SUSPEND || PL_regkind[type] == CURLY) \
 289    rn = NEXTOPER(NEXTOPER(rn)); \
 290   else if (type == PLUS) \
 291    rn = NEXTOPER(rn); \
 292   else if (type == IFMATCH) \
 293    rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
 294   else rn += NEXT_OFF(rn); \
 295  } \
 296 } STMT_END
 297
 298
 299 static void restore_pos(pTHX_ void *arg);
 300
 301 STATIC CHECKPOINT
 302 S_regcppush(pTHX_ I32 parenfloor)
 303 {
 304  dVAR;
 305  const int retval = PL_savestack_ix;
 306 #define REGCP_PAREN_ELEMS 4
 307  const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
 308  int p;
 309  GET_RE_DEBUG_FLAGS_DECL;
 310
 311  if (paren_elems_to_push < 0)
 312   Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
 313
 314 #define REGCP_OTHER_ELEMS 7
 315  SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
 316
 317  for (p = PL_regsize; p > parenfloor; p--) {
 318 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
 319   SSPUSHINT(PL_regoffs[p].end);
 320   SSPUSHINT(PL_regoffs[p].start);
 321   SSPUSHPTR(PL_reg_start_tmp[p]);
 322   SSPUSHINT(p);
 323   DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
 324   "     saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
 325      (UV)p, (IV)PL_regoffs[p].start,
 326      (IV)(PL_reg_start_tmp[p] - PL_bostr),
 327      (IV)PL_regoffs[p].end
 328   ));
 329  }
 330 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
 331  SSPUSHPTR(PL_regoffs);
 332  SSPUSHINT(PL_regsize);
 333  SSPUSHINT(*PL_reglastparen);
 334  SSPUSHINT(*PL_reglastcloseparen);
 335  SSPUSHPTR(PL_reginput);
 336 #define REGCP_FRAME_ELEMS 2
 337 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
 338  * are needed for the regexp context stack bookkeeping. */
 339  SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
 340  SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
 341
 342  return retval;
 343 }
 344
 345 /* These are needed since we do not localize EVAL nodes: */
 346 #define REGCP_SET(cp)                                           \
 347  DEBUG_STATE_r(                                              \
 348    PerlIO_printf(Perl_debug_log,          \
 349     "  Setting an EVAL scope, savestack=%"IVdf"\n", \
 350     (IV)PL_savestack_ix));                          \
 351  cp = PL_savestack_ix
 352
 353 #define REGCP_UNWIND(cp)                                        \
 354  DEBUG_STATE_r(                                              \
 355   if (cp != PL_savestack_ix)                   \
 356     PerlIO_printf(Perl_debug_log,          \
 357     "  Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
 358     (IV)(cp), (IV)PL_savestack_ix));                \
 359  regcpblow(cp)
 360
 361 STATIC char *
 362 S_regcppop(pTHX_ const regexp *rex)
 363 {
 364  dVAR;
 365  U32 i;
 366  char *input;
 367  GET_RE_DEBUG_FLAGS_DECL;
 368
 369  PERL_ARGS_ASSERT_REGCPPOP;
 370
 371  /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
 372  i = SSPOPINT;
 373  assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
 374  i = SSPOPINT; /* Parentheses elements to pop. */
 375  input = (char *) SSPOPPTR;
 376  *PL_reglastcloseparen = SSPOPINT;
 377  *PL_reglastparen = SSPOPINT;
 378  PL_regsize = SSPOPINT;
 379  PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
 380
 381
 382  /* Now restore the parentheses context. */
 383  for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
 384   i > 0; i -= REGCP_PAREN_ELEMS) {
 385   I32 tmps;
 386   U32 paren = (U32)SSPOPINT;
 387   PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
 388   PL_regoffs[paren].start = SSPOPINT;
 389   tmps = SSPOPINT;
 390   if (paren <= *PL_reglastparen)
 391    PL_regoffs[paren].end = tmps;
 392   DEBUG_BUFFERS_r(
 393    PerlIO_printf(Perl_debug_log,
 394       "     restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
 395       (UV)paren, (IV)PL_regoffs[paren].start,
 396       (IV)(PL_reg_start_tmp[paren] - PL_bostr),
 397       (IV)PL_regoffs[paren].end,
 398       (paren > *PL_reglastparen ? "(no)" : ""));
 399   );
 400  }
 401  DEBUG_BUFFERS_r(
 402   if (*PL_reglastparen + 1 <= rex->nparens) {
 403    PerlIO_printf(Perl_debug_log,
 404       "     restoring \\%"IVdf"..\\%"IVdf" to undef\n",
 405       (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
 406   }
 407  );
 408 #if 1
 409  /* It would seem that the similar code in regtry()
 410  * already takes care of this, and in fact it is in
 411  * a better location to since this code can #if 0-ed out
 412  * but the code in regtry() is needed or otherwise tests
 413  * requiring null fields (pat.t#187 and split.t#{13,14}
 414  * (as of patchlevel 7877)  will fail.  Then again,
 415  * this code seems to be necessary or otherwise
 416  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
 417  * --jhi updated by dapm */
 418  for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
 419   if (i > PL_regsize)
 420    PL_regoffs[i].start = -1;
 421   PL_regoffs[i].end = -1;
 422  }
 423 #endif
 424  return input;
 425 }
 426
 427 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
 428
 429 /*
 430  * pregexec and friends
 431  */
 432
 433 #ifndef PERL_IN_XSUB_RE
 434 /*
 435  - pregexec - match a regexp against a string
 436  */
 437 I32
 438 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
 439   char *strbeg, I32 minend, SV *screamer, U32 nosave)
 440 /* strend: pointer to null at end of string */
 441 /* strbeg: real beginning of string */
 442 /* minend: end of match must be >=minend after stringarg. */
 443 /* nosave: For optimizations. */
 444 {
 445  PERL_ARGS_ASSERT_PREGEXEC;
 446
 447  return
 448   regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
 449      nosave ? 0 : REXEC_COPY_STR);
 450 }
 451 #endif
 452
 453 /*
 454  * Need to implement the following flags for reg_anch:
 455  *
 456  * USE_INTUIT_NOML  - Useful to call re_intuit_start() first
 457  * USE_INTUIT_ML
 458  * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
 459  * INTUIT_AUTORITATIVE_ML
 460  * INTUIT_ONCE_NOML  - Intuit can match in one location only.
 461  * INTUIT_ONCE_ML
 462  *
 463  * Another flag for this function: SECOND_TIME (so that float substrs
 464  * with giant delta may be not rechecked).
 465  */
 466
 467 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
 468
 469 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
 470    Otherwise, only SvCUR(sv) is used to get strbeg. */
 471
 472 /* XXXX We assume that strpos is strbeg unless sv. */
 473
 474 /* XXXX Some places assume that there is a fixed substring.
 475   An update may be needed if optimizer marks as "INTUITable"
 476   RExen without fixed substrings.  Similarly, it is assumed that
 477   lengths of all the strings are no more than minlen, thus they
 478   cannot come from lookahead.
 479   (Or minlen should take into account lookahead.)
 480   NOTE: Some of this comment is not correct. minlen does now take account
 481   of lookahead/behind. Further research is required. -- demerphq
 482
 483 */
 484
 485 /* A failure to find a constant substring means that there is no need to make
 486    an expensive call to REx engine, thus we celebrate a failure.  Similarly,
 487    finding a substring too deep into the string means that less calls to
 488    regtry() should be needed.
 489
 490    REx compiler's optimizer found 4 possible hints:
 491   a) Anchored substring;
 492   b) Fixed substring;
 493   c) Whether we are anchored (beginning-of-line or \G);
 494   d) First node (of those at offset 0) which may distingush positions;
 495    We use a)b)d) and multiline-part of c), and try to find a position in the
 496    string which does not contradict any of them.
 497  */
 498
 499 /* Most of decisions we do here should have been done at compile time.
 500    The nodes of the REx which we used for the search should have been
 501    deleted from the finite automaton. */
 502
 503 char *
 504 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
 505      char *strend, const U32 flags, re_scream_pos_data *data)
 506 {
 507  dVAR;
 508  struct regexp *const prog = (struct regexp *)SvANY(rx);
 509  register I32 start_shift = 0;
 510  /* Should be nonnegative! */
 511  register I32 end_shift   = 0;
 512  register char *s;
 513  register SV *check;
 514  char *strbeg;
 515  char *t;
 516  const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
 517  I32 ml_anch;
 518  register char *other_last = NULL; /* other substr checked before this */
 519  char *check_at = NULL;  /* check substr found at this pos */
 520  const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
 521  RXi_GET_DECL(prog,progi);
 522 #ifdef DEBUGGING
 523  const char * const i_strpos = strpos;
 524 #endif
 525  GET_RE_DEBUG_FLAGS_DECL;
 526
 527  PERL_ARGS_ASSERT_RE_INTUIT_START;
 528
 529  RX_MATCH_UTF8_set(rx,do_utf8);
 530
 531  if (RX_UTF8(rx)) {
 532   PL_reg_flags |= RF_utf8;
 533  }
 534  DEBUG_EXECUTE_r(
 535   debug_start_match(rx, do_utf8, strpos, strend,
 536    sv ? "Guessing start of match in sv for"
 537    : "Guessing start of match in string for");
 538    );
 539
 540  /* CHR_DIST() would be more correct here but it makes things slow. */
 541  if (prog->minlen > strend - strpos) {
 542   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 543        "String too short... [re_intuit_start]\n"));
 544   goto fail;
 545  }
 546
 547  strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
 548  PL_regeol = strend;
 549  if (do_utf8) {
 550   if (!prog->check_utf8 && prog->check_substr)
 551    to_utf8_substr(prog);
 552   check = prog->check_utf8;
 553  } else {
 554   if (!prog->check_substr && prog->check_utf8)
 555    to_byte_substr(prog);
 556   check = prog->check_substr;
 557  }
 558  if (check == &PL_sv_undef) {
 559   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 560     "Non-utf8 string cannot match utf8 check string\n"));
 561   goto fail;
 562  }
 563  if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
 564   ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
 565      || ( (prog->extflags & RXf_ANCH_BOL)
 566       && !multiline ) ); /* Check after \n? */
 567
 568   if (!ml_anch) {
 569   if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
 570     && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
 571    /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
 572    && sv && !SvROK(sv)
 573    && (strpos != strbeg)) {
 574    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
 575    goto fail;
 576   }
 577   if (prog->check_offset_min == prog->check_offset_max &&
 578    !(prog->extflags & RXf_CANY_SEEN)) {
 579    /* Substring at constant offset from beg-of-str... */
 580    I32 slen;
 581
 582    s = HOP3c(strpos, prog->check_offset_min, strend);
 583
 584    if (SvTAIL(check)) {
 585     slen = SvCUR(check); /* >= 1 */
 586
 587     if ( strend - s > slen || strend - s < slen - 1
 588      || (strend - s == slen && strend[-1] != '\n')) {
 589      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
 590      goto fail_finish;
 591     }
 592     /* Now should match s[0..slen-2] */
 593     slen--;
 594     if (slen && (*SvPVX_const(check) != *s
 595        || (slen > 1
 596         && memNE(SvPVX_const(check), s, slen)))) {
 597     report_neq:
 598      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
 599      goto fail_finish;
 600     }
 601    }
 602    else if (*SvPVX_const(check) != *s
 603      || ((slen = SvCUR(check)) > 1
 604       && memNE(SvPVX_const(check), s, slen)))
 605     goto report_neq;
 606    check_at = s;
 607    goto success_at_start;
 608   }
 609   }
 610   /* Match is anchored, but substr is not anchored wrt beg-of-str. */
 611   s = strpos;
 612   start_shift = prog->check_offset_min; /* okay to underestimate on CC */
 613   end_shift = prog->check_end_shift;
 614
 615   if (!ml_anch) {
 616    const I32 end = prog->check_offset_max + CHR_SVLEN(check)
 617           - (SvTAIL(check) != 0);
 618    const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
 619
 620    if (end_shift < eshift)
 621     end_shift = eshift;
 622   }
 623  }
 624  else {    /* Can match at random position */
 625   ml_anch = 0;
 626   s = strpos;
 627   start_shift = prog->check_offset_min;  /* okay to underestimate on CC */
 628   end_shift = prog->check_end_shift;
 629
 630   /* end shift should be non negative here */
 631  }
 632
 633 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
 634  if (end_shift < 0)
 635   Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
 636     (IV)end_shift, RX_PRECOMP(prog));
 637 #endif
 638
 639   restart:
 640  /* Find a possible match in the region s..strend by looking for
 641  the "check" substring in the region corrected by start/end_shift. */
 642
 643  {
 644   I32 srch_start_shift = start_shift;
 645   I32 srch_end_shift = end_shift;
 646   if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
 647    srch_end_shift -= ((strbeg - s) - srch_start_shift);
 648    srch_start_shift = strbeg - s;
 649   }
 650  DEBUG_OPTIMISE_MORE_r({
 651   PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
 652    (IV)prog->check_offset_min,
 653    (IV)srch_start_shift,
 654    (IV)srch_end_shift,
 655    (IV)prog->check_end_shift);
 656  });
 657
 658  if (flags & REXEC_SCREAM) {
 659   I32 p = -1;   /* Internal iterator of scream. */
 660   I32 * const pp = data ? data->scream_pos : &p;
 661
 662   if (PL_screamfirst[BmRARE(check)] >= 0
 663    || ( BmRARE(check) == '\n'
 664     && (BmPREVIOUS(check) == SvCUR(check) - 1)
 665     && SvTAIL(check) ))
 666    s = screaminstr(sv, check,
 667        srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
 668   else
 669    goto fail_finish;
 670   /* we may be pointing at the wrong string */
 671   if (s && RXp_MATCH_COPIED(prog))
 672    s = strbeg + (s - SvPVX_const(sv));
 673   if (data)
 674    *data->scream_olds = s;
 675  }
 676  else {
 677   U8* start_point;
 678   U8* end_point;
 679   if (prog->extflags & RXf_CANY_SEEN) {
 680    start_point= (U8*)(s + srch_start_shift);
 681    end_point= (U8*)(strend - srch_end_shift);
 682   } else {
 683    start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
 684    end_point= HOP3(strend, -srch_end_shift, strbeg);
 685   }
 686   DEBUG_OPTIMISE_MORE_r({
 687    PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
 688     (int)(end_point - start_point),
 689     (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
 690     start_point);
 691   });
 692
 693   s = fbm_instr( start_point, end_point,
 694      check, multiline ? FBMrf_MULTILINE : 0);
 695  }
 696  }
 697  /* Update the count-of-usability, remove useless subpatterns,
 698   unshift s.  */
 699
 700  DEBUG_EXECUTE_r({
 701   RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 702    SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
 703   PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
 704       (s ? "Found" : "Did not find"),
 705    (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
 706     ? "anchored" : "floating"),
 707    quoted,
 708    RE_SV_TAIL(check),
 709    (s ? " at offset " : "...\n") );
 710  });
 711
 712  if (!s)
 713   goto fail_finish;
 714  /* Finish the diagnostic message */
 715  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
 716
 717  /* XXX dmq: first branch is for positive lookbehind...
 718  Our check string is offset from the beginning of the pattern.
 719  So we need to do any stclass tests offset forward from that
 720  point. I think. :-(
 721  */
 722
 723
 724
 725  check_at=s;
 726
 727
 728  /* Got a candidate.  Check MBOL anchoring, and the *other* substr.
 729  Start with the other substr.
 730  XXXX no SCREAM optimization yet - and a very coarse implementation
 731  XXXX /ttx+/ results in anchored="ttx", floating="x".  floating will
 732     *always* match.  Probably should be marked during compile...
 733  Probably it is right to do no SCREAM here...
 734  */
 735
 736  if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
 737     : (prog->float_substr && prog->anchored_substr))
 738  {
 739   /* Take into account the "other" substring. */
 740   /* XXXX May be hopelessly wrong for UTF... */
 741   if (!other_last)
 742    other_last = strpos;
 743   if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
 744   do_other_anchored:
 745    {
 746     char * const last = HOP3c(s, -start_shift, strbeg);
 747     char *last1, *last2;
 748     char * const saved_s = s;
 749     SV* must;
 750
 751     t = s - prog->check_offset_max;
 752     if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 753      && (!do_utf8
 754       || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
 755        && t > strpos)))
 756      NOOP;
 757     else
 758      t = strpos;
 759     t = HOP3c(t, prog->anchored_offset, strend);
 760     if (t < other_last) /* These positions already checked */
 761      t = other_last;
 762     last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
 763     if (last < last1)
 764      last1 = last;
 765     /* XXXX It is not documented what units *_offsets are in.
 766     We assume bytes, but this is clearly wrong.
 767     Meaning this code needs to be carefully reviewed for errors.
 768     dmq.
 769     */
 770
 771     /* On end-of-str: see comment below. */
 772     must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
 773     if (must == &PL_sv_undef) {
 774      s = (char*)NULL;
 775      DEBUG_r(must = prog->anchored_utf8); /* for debug */
 776     }
 777     else
 778      s = fbm_instr(
 779       (unsigned char*)t,
 780       HOP3(HOP3(last1, prog->anchored_offset, strend)
 781         + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
 782       must,
 783       multiline ? FBMrf_MULTILINE : 0
 784      );
 785     DEBUG_EXECUTE_r({
 786      RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 787       SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 788      PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
 789       (s ? "Found" : "Contradicts"),
 790       quoted, RE_SV_TAIL(must));
 791     });
 792
 793
 794     if (!s) {
 795      if (last1 >= last2) {
 796       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 797             ", giving up...\n"));
 798       goto fail_finish;
 799      }
 800      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 801       ", trying floating at offset %ld...\n",
 802       (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
 803      other_last = HOP3c(last1, prog->anchored_offset+1, strend);
 804      s = HOP3c(last, 1, strend);
 805      goto restart;
 806     }
 807     else {
 808      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 809       (long)(s - i_strpos)));
 810      t = HOP3c(s, -prog->anchored_offset, strbeg);
 811      other_last = HOP3c(s, 1, strend);
 812      s = saved_s;
 813      if (t == strpos)
 814       goto try_at_start;
 815      goto try_at_offset;
 816     }
 817    }
 818   }
 819   else {  /* Take into account the floating substring. */
 820    char *last, *last1;
 821    char * const saved_s = s;
 822    SV* must;
 823
 824    t = HOP3c(s, -start_shift, strbeg);
 825    last1 = last =
 826     HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
 827    if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
 828     last = HOP3c(t, prog->float_max_offset, strend);
 829    s = HOP3c(t, prog->float_min_offset, strend);
 830    if (s < other_last)
 831     s = other_last;
 832  /* XXXX It is not documented what units *_offsets are in.  Assume bytes.  */
 833    must = do_utf8 ? prog->float_utf8 : prog->float_substr;
 834    /* fbm_instr() takes into account exact value of end-of-str
 835    if the check is SvTAIL(ed).  Since false positives are OK,
 836    and end-of-str is not later than strend we are OK. */
 837    if (must == &PL_sv_undef) {
 838     s = (char*)NULL;
 839     DEBUG_r(must = prog->float_utf8); /* for debug message */
 840    }
 841    else
 842     s = fbm_instr((unsigned char*)s,
 843        (unsigned char*)last + SvCUR(must)
 844         - (SvTAIL(must)!=0),
 845        must, multiline ? FBMrf_MULTILINE : 0);
 846    DEBUG_EXECUTE_r({
 847     RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
 848      SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
 849     PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
 850      (s ? "Found" : "Contradicts"),
 851      quoted, RE_SV_TAIL(must));
 852    });
 853    if (!s) {
 854     if (last1 == last) {
 855      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 856            ", giving up...\n"));
 857      goto fail_finish;
 858     }
 859     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
 860      ", trying anchored starting at offset %ld...\n",
 861      (long)(saved_s + 1 - i_strpos)));
 862     other_last = last;
 863     s = HOP3c(t, 1, strend);
 864     goto restart;
 865    }
 866    else {
 867     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
 868      (long)(s - i_strpos)));
 869     other_last = s; /* Fix this later. --Hugo */
 870     s = saved_s;
 871     if (t == strpos)
 872      goto try_at_start;
 873     goto try_at_offset;
 874    }
 875   }
 876  }
 877
 878
 879  t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
 880
 881  DEBUG_OPTIMISE_MORE_r(
 882   PerlIO_printf(Perl_debug_log,
 883    "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
 884    (IV)prog->check_offset_min,
 885    (IV)prog->check_offset_max,
 886    (IV)(s-strpos),
 887    (IV)(t-strpos),
 888    (IV)(t-s),
 889    (IV)(strend-strpos)
 890   )
 891  );
 892
 893  if (s - strpos > prog->check_offset_max  /* signed-corrected t > strpos */
 894   && (!do_utf8
 895    || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
 896     && t > strpos)))
 897  {
 898   /* Fixed substring is found far enough so that the match
 899   cannot start at strpos. */
 900  try_at_offset:
 901   if (ml_anch && t[-1] != '\n') {
 902    /* Eventually fbm_*() should handle this, but often
 903    anchored_offset is not 0, so this check will not be wasted. */
 904    /* XXXX In the code below we prefer to look for "^" even in
 905    presence of anchored substrings.  And we search even
 906    beyond the found float position.  These pessimizations
 907    are historical artefacts only.  */
 908   find_anchor:
 909    while (t < strend - prog->minlen) {
 910     if (*t == '\n') {
 911      if (t < check_at - prog->check_offset_min) {
 912       if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
 913        /* Since we moved from the found position,
 914        we definitely contradict the found anchored
 915        substr.  Due to the above check we do not
 916        contradict "check" substr.
 917        Thus we can arrive here only if check substr
 918        is float.  Redo checking for "other"=="fixed".
 919        */
 920        strpos = t + 1;
 921        DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
 922         PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
 923        goto do_other_anchored;
 924       }
 925       /* We don't contradict the found floating substring. */
 926       /* XXXX Why not check for STCLASS? */
 927       s = t + 1;
 928       DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
 929        PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
 930       goto set_useful;
 931      }
 932      /* Position contradicts check-string */
 933      /* XXXX probably better to look for check-string
 934      than for "\n", so one should lower the limit for t? */
 935      DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
 936       PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
 937      other_last = strpos = s = t + 1;
 938      goto restart;
 939     }
 940     t++;
 941    }
 942    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
 943       PL_colors[0], PL_colors[1]));
 944    goto fail_finish;
 945   }
 946   else {
 947    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
 948       PL_colors[0], PL_colors[1]));
 949   }
 950   s = t;
 951  set_useful:
 952   ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
 953  }
 954  else {
 955   /* The found string does not prohibit matching at strpos,
 956   - no optimization of calling REx engine can be performed,
 957   unless it was an MBOL and we are not after MBOL,
 958   or a future STCLASS check will fail this. */
 959  try_at_start:
 960   /* Even in this situation we may use MBOL flag if strpos is offset
 961   wrt the start of the string. */
 962   if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
 963    && (strpos != strbeg) && strpos[-1] != '\n'
 964    /* May be due to an implicit anchor of m{.*foo}  */
 965    && !(prog->intflags & PREGf_IMPLICIT))
 966   {
 967    t = strpos;
 968    goto find_anchor;
 969   }
 970   DEBUG_EXECUTE_r( if (ml_anch)
 971    PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
 972       (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
 973   );
 974  success_at_start:
 975   if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
 976    && (do_utf8 ? (
 977     prog->check_utf8  /* Could be deleted already */
 978     && --BmUSEFUL(prog->check_utf8) < 0
 979     && (prog->check_utf8 == prog->float_utf8)
 980    ) : (
 981     prog->check_substr  /* Could be deleted already */
 982     && --BmUSEFUL(prog->check_substr) < 0
 983     && (prog->check_substr == prog->float_substr)
 984    )))
 985   {
 986    /* If flags & SOMETHING - do not do it many times on the same match */
 987    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
 988    /* XXX Does the destruction order has to change with do_utf8? */
 989    SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
 990    SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
 991    prog->check_substr = prog->check_utf8 = NULL; /* disable */
 992    prog->float_substr = prog->float_utf8 = NULL; /* clear */
 993    check = NULL;   /* abort */
 994    s = strpos;
 995    /* XXXX This is a remnant of the old implementation.  It
 996      looks wasteful, since now INTUIT can use many
 997      other heuristics. */
 998    prog->extflags &= ~RXf_USE_INTUIT;
 999   }
1000   else
1001    s = strpos;
1002  }
1003
1004  /* Last resort... */
1005  /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1006  /* trie stclasses are too expensive to use here, we are better off to
1007  leave it to regmatch itself */
1008  if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1009   /* minlen == 0 is possible if regstclass is \b or \B,
1010   and the fixed substr is ''$.
1011   Since minlen is already taken into account, s+1 is before strend;
1012   accidentally, minlen >= 1 guaranties no false positives at s + 1
1013   even for \b or \B.  But (minlen? 1 : 0) below assumes that
1014   regstclass does not come from lookahead...  */
1015   /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1016   This leaves EXACTF only, which is dealt with in find_byclass().  */
1017   const U8* const str = (U8*)STRING(progi->regstclass);
1018   const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1019      ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1020      : 1);
1021   char * endpos;
1022   if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1023    endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1024   else if (prog->float_substr || prog->float_utf8)
1025    endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1026   else
1027    endpos= strend;
1028
1029   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1030          (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1031
1032   t = s;
1033   s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1034   if (!s) {
1035 #ifdef DEBUGGING
1036    const char *what = NULL;
1037 #endif
1038    if (endpos == strend) {
1039     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1040         "Could not match STCLASS...\n") );
1041     goto fail;
1042    }
1043    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1044         "This position contradicts STCLASS...\n") );
1045    if ((prog->extflags & RXf_ANCH) && !ml_anch)
1046     goto fail;
1047    /* Contradict one of substrings */
1048    if (prog->anchored_substr || prog->anchored_utf8) {
1049     if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1050      DEBUG_EXECUTE_r( what = "anchored" );
1051     hop_and_restart:
1052      s = HOP3c(t, 1, strend);
1053      if (s + start_shift + end_shift > strend) {
1054       /* XXXX Should be taken into account earlier? */
1055       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1056            "Could not match STCLASS...\n") );
1057       goto fail;
1058      }
1059      if (!check)
1060       goto giveup;
1061      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1062         "Looking for %s substr starting at offset %ld...\n",
1063         what, (long)(s + start_shift - i_strpos)) );
1064      goto restart;
1065     }
1066     /* Have both, check_string is floating */
1067     if (t + start_shift >= check_at) /* Contradicts floating=check */
1068      goto retry_floating_check;
1069     /* Recheck anchored substring, but not floating... */
1070     s = check_at;
1071     if (!check)
1072      goto giveup;
1073     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1074       "Looking for anchored substr starting at offset %ld...\n",
1075       (long)(other_last - i_strpos)) );
1076     goto do_other_anchored;
1077    }
1078    /* Another way we could have checked stclass at the
1079    current position only: */
1080    if (ml_anch) {
1081     s = t = t + 1;
1082     if (!check)
1083      goto giveup;
1084     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1085       "Looking for /%s^%s/m starting at offset %ld...\n",
1086       PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1087     goto try_at_offset;
1088    }
1089    if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1090     goto fail;
1091    /* Check is floating subtring. */
1092   retry_floating_check:
1093    t = check_at - start_shift;
1094    DEBUG_EXECUTE_r( what = "floating" );
1095    goto hop_and_restart;
1096   }
1097   if (t != s) {
1098    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1099       "By STCLASS: moving %ld --> %ld\n",
1100         (long)(t - i_strpos), (long)(s - i_strpos))
1101     );
1102   }
1103   else {
1104    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1105         "Does not contradict STCLASS...\n");
1106     );
1107   }
1108  }
1109   giveup:
1110  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1111       PL_colors[4], (check ? "Guessed" : "Giving up"),
1112       PL_colors[5], (long)(s - i_strpos)) );
1113  return s;
1114
1115   fail_finish:    /* Substring not found */
1116  if (prog->check_substr || prog->check_utf8)  /* could be removed already */
1117   BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1118   fail:
1119  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1120       PL_colors[4], PL_colors[5]));
1121  return NULL;
1122 }
1123
1124 #define DECL_TRIE_TYPE(scan) \
1125  const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1126      trie_type = (scan->flags != EXACT) \
1127        ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1128        : (do_utf8 ? trie_utf8 : trie_plain)
1129
1130 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
1131 uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
1132  switch (trie_type) {                                                    \
1133  case trie_utf8_fold:                                                    \
1134   if ( foldlen>0 ) {                                                  \
1135    uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1136    foldlen -= len;                                                 \
1137    uscan += len;                                                   \
1138    len=0;                                                          \
1139   } else {                                                            \
1140    uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1141    uvc = to_uni_fold( uvc, foldbuf, &foldlen );                    \
1142    foldlen -= UNISKIP( uvc );                                      \
1143    uscan = foldbuf + UNISKIP( uvc );                               \
1144   }                                                                   \
1145   break;                                                              \
1146  case trie_latin_utf8_fold:                                              \
1147   if ( foldlen>0 ) {                                                  \
1148    uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
1149    foldlen -= len;                                                 \
1150    uscan += len;                                                   \
1151    len=0;                                                          \
1152   } else {                                                            \
1153    len = 1;                                                        \
1154    uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
1155    foldlen -= UNISKIP( uvc );                                      \
1156    uscan = foldbuf + UNISKIP( uvc );                               \
1157   }                                                                   \
1158   break;                                                              \
1159  case trie_utf8:                                                         \
1160   uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
1161   break;                                                              \
1162  case trie_plain:                                                        \
1163   uvc = (UV)*uc;                                                      \
1164   len = 1;                                                            \
1165  }                                                                       \
1166  if (uvc < 256) {                                                        \
1167   charid = trie->charmap[ uvc ];                                      \
1168  }                                                                       \
1169  else {                                                                  \
1170   charid = 0;                                                         \
1171   if (widecharmap) {                                                  \
1172    SV** const svpp = hv_fetch(widecharmap,                         \
1173       (char*)&uvc, sizeof(UV), 0);                        \
1174    if (svpp)                                                       \
1175     charid = (U16)SvIV(*svpp);                                  \
1176   }                                                                   \
1177  }                                                                       \
1178 } STMT_END
1179
1180 #define REXEC_FBC_EXACTISH_CHECK(CoNd)                 \
1181 {                                                      \
1182  char *my_strend= (char *)strend;                   \
1183  if ( (CoNd)                                        \
1184   && (ln == len ||                              \
1185    !ibcmp_utf8(s, &my_strend, 0,  do_utf8,   \
1186       m, NULL, ln, (bool)UTF))       \
1187   && (!reginfo || regtry(reginfo, &s)) )        \
1188   goto got_it;                                   \
1189  else {                                             \
1190   U8 foldbuf[UTF8_MAXBYTES_CASE+1];             \
1191   uvchr_to_utf8(tmpbuf, c);                     \
1192   f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);  \
1193   if ( f != c                                   \
1194    && (f == c1 || f == c2)                  \
1195    && (ln == len ||                         \
1196     !ibcmp_utf8(s, &my_strend, 0,  do_utf8,\
1197        m, NULL, ln, (bool)UTF)) \
1198    && (!reginfo || regtry(reginfo, &s)) )   \
1199    goto got_it;                             \
1200  }                                                  \
1201 }                                                      \
1202 s += len
1203
1204 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
1205 STMT_START {                                              \
1206  while (s <= e) {                                      \
1207   if ( (CoNd)                                       \
1208    && (ln == 1 || !(OP(c) == EXACTF             \
1209        ? ibcmp(s, m, ln)           \
1210        : ibcmp_locale(s, m, ln)))  \
1211    && (!reginfo || regtry(reginfo, &s)) )        \
1212    goto got_it;                                  \
1213   s++;                                              \
1214  }                                                     \
1215 } STMT_END
1216
1217 #define REXEC_FBC_UTF8_SCAN(CoDe)                     \
1218 STMT_START {                                          \
1219  while (s + (uskip = UTF8SKIP(s)) <= strend) {     \
1220   CoDe                                          \
1221   s += uskip;                                   \
1222  }                                                 \
1223 } STMT_END
1224
1225 #define REXEC_FBC_SCAN(CoDe)                          \
1226 STMT_START {                                          \
1227  while (s < strend) {                              \
1228   CoDe                                          \
1229   s++;                                          \
1230  }                                                 \
1231 } STMT_END
1232
1233 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd)               \
1234 REXEC_FBC_UTF8_SCAN(                                  \
1235  if (CoNd) {                                       \
1236   if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1237    goto got_it;                              \
1238   else                                          \
1239    tmp = doevery;                            \
1240  }                                                 \
1241  else                                              \
1242   tmp = 1;                                      \
1243 )
1244
1245 #define REXEC_FBC_CLASS_SCAN(CoNd)                    \
1246 REXEC_FBC_SCAN(                                       \
1247  if (CoNd) {                                       \
1248   if (tmp && (!reginfo || regtry(reginfo, &s)))  \
1249    goto got_it;                              \
1250   else                                          \
1251    tmp = doevery;                            \
1252  }                                                 \
1253  else                                              \
1254   tmp = 1;                                      \
1255 )
1256
1257 #define REXEC_FBC_TRYIT               \
1258 if ((!reginfo || regtry(reginfo, &s))) \
1259  goto got_it
1260
1261 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd)                         \
1262  if (do_utf8) {                                             \
1263   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1264  }                                                          \
1265  else {                                                     \
1266   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1267  }                                                          \
1268  break
1269
1270 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd)      \
1271  if (do_utf8) {                                             \
1272   UtFpReLoAd;                                            \
1273   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1274  }                                                          \
1275  else {                                                     \
1276   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1277  }                                                          \
1278  break
1279
1280 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd)                   \
1281  PL_reg_flags |= RF_tainted;                                \
1282  if (do_utf8) {                                             \
1283   REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8);                   \
1284  }                                                          \
1285  else {                                                     \
1286   REXEC_FBC_CLASS_SCAN(CoNd);                            \
1287  }                                                          \
1288  break
1289
1290 #define DUMP_EXEC_POS(li,s,doutf8) \
1291  dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1292
1293 /* We know what class REx starts with.  Try to find this position... */
1294 /* if reginfo is NULL, its a dryrun */
1295 /* annoyingly all the vars in this routine have different names from their counterparts
1296    in regmatch. /grrr */
1297
1298 STATIC char *
1299 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1300  const char *strend, regmatch_info *reginfo)
1301 {
1302   dVAR;
1303   const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1304   char *m;
1305   STRLEN ln;
1306   STRLEN lnc;
1307   register STRLEN uskip;
1308   unsigned int c1;
1309   unsigned int c2;
1310   char *e;
1311   register I32 tmp = 1; /* Scratch variable? */
1312   register const bool do_utf8 = PL_reg_match_utf8;
1313   RXi_GET_DECL(prog,progi);
1314
1315   PERL_ARGS_ASSERT_FIND_BYCLASS;
1316
1317   /* We know what class it must start with. */
1318   switch (OP(c)) {
1319   case ANYOF:
1320    if (do_utf8) {
1321     REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1322       !UTF8_IS_INVARIANT((U8)s[0]) ?
1323       reginclass(prog, c, (U8*)s, 0, do_utf8) :
1324       REGINCLASS(prog, c, (U8*)s));
1325    }
1326    else {
1327     while (s < strend) {
1328      STRLEN skip = 1;
1329
1330      if (REGINCLASS(prog, c, (U8*)s) ||
1331       (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1332       /* The assignment of 2 is intentional:
1333        * for the folded sharp s, the skip is 2. */
1334       (skip = SHARP_S_SKIP))) {
1335       if (tmp && (!reginfo || regtry(reginfo, &s)))
1336         goto got_it;
1337       else
1338         tmp = doevery;
1339      }
1340      else
1341       tmp = 1;
1342      s += skip;
1343     }
1344    }
1345    break;
1346   case CANY:
1347    REXEC_FBC_SCAN(
1348     if (tmp && (!reginfo || regtry(reginfo, &s)))
1349      goto got_it;
1350     else
1351      tmp = doevery;
1352    );
1353    break;
1354   case EXACTF:
1355    m   = STRING(c);
1356    ln  = STR_LEN(c); /* length to match in octets/bytes */
1357    lnc = (I32) ln; /* length to match in characters */
1358    if (UTF) {
1359     STRLEN ulen1, ulen2;
1360     U8 *sm = (U8 *) m;
1361     U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1362     U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1363     /* used by commented-out code below */
1364     /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1365
1366     /* XXX: Since the node will be case folded at compile
1367     time this logic is a little odd, although im not
1368     sure that its actually wrong. --dmq */
1369
1370     c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1371     c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1372
1373     /* XXX: This is kinda strange. to_utf8_XYZ returns the
1374     codepoint of the first character in the converted
1375     form, yet originally we did the extra step.
1376     No tests fail by commenting this code out however
1377     so Ive left it out. -- dmq.
1378
1379     c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1380          0, uniflags);
1381     c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1382          0, uniflags);
1383     */
1384
1385     lnc = 0;
1386     while (sm < ((U8 *) m + ln)) {
1387      lnc++;
1388      sm += UTF8SKIP(sm);
1389     }
1390    }
1391    else {
1392     c1 = *(U8*)m;
1393     c2 = PL_fold[c1];
1394    }
1395    goto do_exactf;
1396   case EXACTFL:
1397    m   = STRING(c);
1398    ln  = STR_LEN(c);
1399    lnc = (I32) ln;
1400    c1 = *(U8*)m;
1401    c2 = PL_fold_locale[c1];
1402   do_exactf:
1403    e = HOP3c(strend, -((I32)lnc), s);
1404
1405    if (!reginfo && e < s)
1406     e = s;   /* Due to minlen logic of intuit() */
1407
1408    /* The idea in the EXACTF* cases is to first find the
1409    * first character of the EXACTF* node and then, if
1410    * necessary, case-insensitively compare the full
1411    * text of the node.  The c1 and c2 are the first
1412    * characters (though in Unicode it gets a bit
1413    * more complicated because there are more cases
1414    * than just upper and lower: one needs to use
1415    * the so-called folding case for case-insensitive
1416    * matching (called "loose matching" in Unicode).
1417    * ibcmp_utf8() will do just that. */
1418
1419    if (do_utf8 || UTF) {
1420     UV c, f;
1421     U8 tmpbuf [UTF8_MAXBYTES+1];
1422     STRLEN len = 1;
1423     STRLEN foldlen;
1424     const U32 uniflags = UTF8_ALLOW_DEFAULT;
1425     if (c1 == c2) {
1426      /* Upper and lower of 1st char are equal -
1427      * probably not a "letter". */
1428      while (s <= e) {
1429       if (do_utf8) {
1430        c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1431           uniflags);
1432       } else {
1433        c = *((U8*)s);
1434       }
1435       REXEC_FBC_EXACTISH_CHECK(c == c1);
1436      }
1437     }
1438     else {
1439      while (s <= e) {
1440       if (do_utf8) {
1441        c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1442           uniflags);
1443       } else {
1444        c = *((U8*)s);
1445       }
1446
1447       /* Handle some of the three Greek sigmas cases.
1448       * Note that not all the possible combinations
1449       * are handled here: some of them are handled
1450       * by the standard folding rules, and some of
1451       * them (the character class or ANYOF cases)
1452       * are handled during compiletime in
1453       * regexec.c:S_regclass(). */
1454       if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1455        c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1456        c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1457
1458       REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1459      }
1460     }
1461    }
1462    else {
1463     /* Neither pattern nor string are UTF8 */
1464     if (c1 == c2)
1465      REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1466     else
1467      REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1468    }
1469    break;
1470   case BOUNDL:
1471    PL_reg_flags |= RF_tainted;
1472    /* FALL THROUGH */
1473   case BOUND:
1474    if (do_utf8) {
1475     if (s == PL_bostr)
1476      tmp = '\n';
1477     else {
1478      U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1479      tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1480     }
1481     tmp = ((OP(c) == BOUND ?
1482       isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1483     LOAD_UTF8_CHARCLASS_ALNUM();
1484     REXEC_FBC_UTF8_SCAN(
1485      if (tmp == !(OP(c) == BOUND ?
1486         (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1487         isALNUM_LC_utf8((U8*)s)))
1488      {
1489       tmp = !tmp;
1490       REXEC_FBC_TRYIT;
1491     }
1492     );
1493    }
1494    else {
1495     tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1496     tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1497     REXEC_FBC_SCAN(
1498      if (tmp ==
1499       !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1500       tmp = !tmp;
1501       REXEC_FBC_TRYIT;
1502     }
1503     );
1504    }
1505    if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1506     goto got_it;
1507    break;
1508   case NBOUNDL:
1509    PL_reg_flags |= RF_tainted;
1510    /* FALL THROUGH */
1511   case NBOUND:
1512    if (do_utf8) {
1513     if (s == PL_bostr)
1514      tmp = '\n';
1515     else {
1516      U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1517      tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1518     }
1519     tmp = ((OP(c) == NBOUND ?
1520       isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1521     LOAD_UTF8_CHARCLASS_ALNUM();
1522     REXEC_FBC_UTF8_SCAN(
1523      if (tmp == !(OP(c) == NBOUND ?
1524         (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1525         isALNUM_LC_utf8((U8*)s)))
1526       tmp = !tmp;
1527      else REXEC_FBC_TRYIT;
1528     );
1529    }
1530    else {
1531     tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1532     tmp = ((OP(c) == NBOUND ?
1533       isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1534     REXEC_FBC_SCAN(
1535      if (tmp ==
1536       !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1537       tmp = !tmp;
1538      else REXEC_FBC_TRYIT;
1539     );
1540    }
1541    if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1542     goto got_it;
1543    break;
1544   case ALNUM:
1545    REXEC_FBC_CSCAN_PRELOAD(
1546     LOAD_UTF8_CHARCLASS_PERL_WORD(),
1547     swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1548     isALNUM(*s)
1549    );
1550   case ALNUML:
1551    REXEC_FBC_CSCAN_TAINT(
1552     isALNUM_LC_utf8((U8*)s),
1553     isALNUM_LC(*s)
1554    );
1555   case NALNUM:
1556    REXEC_FBC_CSCAN_PRELOAD(
1557     LOAD_UTF8_CHARCLASS_PERL_WORD(),
1558     !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1559     !isALNUM(*s)
1560    );
1561   case NALNUML:
1562    REXEC_FBC_CSCAN_TAINT(
1563     !isALNUM_LC_utf8((U8*)s),
1564     !isALNUM_LC(*s)
1565    );
1566   case SPACE:
1567    REXEC_FBC_CSCAN_PRELOAD(
1568     LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1569     *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
1570     isSPACE(*s)
1571    );
1572   case SPACEL:
1573    REXEC_FBC_CSCAN_TAINT(
1574     *s == ' ' || isSPACE_LC_utf8((U8*)s),
1575     isSPACE_LC(*s)
1576    );
1577   case NSPACE:
1578    REXEC_FBC_CSCAN_PRELOAD(
1579     LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1580     !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
1581     !isSPACE(*s)
1582    );
1583   case NSPACEL:
1584    REXEC_FBC_CSCAN_TAINT(
1585     !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1586     !isSPACE_LC(*s)
1587    );
1588   case DIGIT:
1589    REXEC_FBC_CSCAN_PRELOAD(
1590     LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1591     swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1592     isDIGIT(*s)
1593    );
1594   case DIGITL:
1595    REXEC_FBC_CSCAN_TAINT(
1596     isDIGIT_LC_utf8((U8*)s),
1597     isDIGIT_LC(*s)
1598    );
1599   case NDIGIT:
1600    REXEC_FBC_CSCAN_PRELOAD(
1601     LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1602     !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1603     !isDIGIT(*s)
1604    );
1605   case NDIGITL:
1606    REXEC_FBC_CSCAN_TAINT(
1607     !isDIGIT_LC_utf8((U8*)s),
1608     !isDIGIT_LC(*s)
1609    );
1610   case LNBREAK:
1611    REXEC_FBC_CSCAN(
1612     is_LNBREAK_utf8(s),
1613     is_LNBREAK_latin1(s)
1614    );
1615   case VERTWS:
1616    REXEC_FBC_CSCAN(
1617     is_VERTWS_utf8(s),
1618     is_VERTWS_latin1(s)
1619    );
1620   case NVERTWS:
1621    REXEC_FBC_CSCAN(
1622     !is_VERTWS_utf8(s),
1623     !is_VERTWS_latin1(s)
1624    );
1625   case HORIZWS:
1626    REXEC_FBC_CSCAN(
1627     is_HORIZWS_utf8(s),
1628     is_HORIZWS_latin1(s)
1629    );
1630   case NHORIZWS:
1631    REXEC_FBC_CSCAN(
1632     !is_HORIZWS_utf8(s),
1633     !is_HORIZWS_latin1(s)
1634    );
1635   case AHOCORASICKC:
1636   case AHOCORASICK:
1637    {
1638     DECL_TRIE_TYPE(c);
1639     /* what trie are we using right now */
1640     reg_ac_data *aho
1641      = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1642     reg_trie_data *trie
1643      = (reg_trie_data*)progi->data->data[ aho->trie ];
1644     HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1645
1646     const char *last_start = strend - trie->minlen;
1647 #ifdef DEBUGGING
1648     const char *real_start = s;
1649 #endif
1650     STRLEN maxlen = trie->maxlen;
1651     SV *sv_points;
1652     U8 **points; /* map of where we were in the input string
1653         when reading a given char. For ASCII this
1654         is unnecessary overhead as the relationship
1655         is always 1:1, but for Unicode, especially
1656         case folded Unicode this is not true. */
1657     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1658     U8 *bitmap=NULL;
1659
1660
1661     GET_RE_DEBUG_FLAGS_DECL;
1662
1663     /* We can't just allocate points here. We need to wrap it in
1664     * an SV so it gets freed properly if there is a croak while
1665     * running the match */
1666     ENTER;
1667     SAVETMPS;
1668     sv_points=newSV(maxlen * sizeof(U8 *));
1669     SvCUR_set(sv_points,
1670      maxlen * sizeof(U8 *));
1671     SvPOK_on(sv_points);
1672     sv_2mortal(sv_points);
1673     points=(U8**)SvPV_nolen(sv_points );
1674     if ( trie_type != trie_utf8_fold
1675      && (trie->bitmap || OP(c)==AHOCORASICKC) )
1676     {
1677      if (trie->bitmap)
1678       bitmap=(U8*)trie->bitmap;
1679      else
1680       bitmap=(U8*)ANYOF_BITMAP(c);
1681     }
1682     /* this is the Aho-Corasick algorithm modified a touch
1683     to include special handling for long "unknown char"
1684     sequences. The basic idea being that we use AC as long
1685     as we are dealing with a possible matching char, when
1686     we encounter an unknown char (and we have not encountered
1687     an accepting state) we scan forward until we find a legal
1688     starting char.
1689     AC matching is basically that of trie matching, except
1690     that when we encounter a failing transition, we fall back
1691     to the current states "fail state", and try the current char
1692     again, a process we repeat until we reach the root state,
1693     state 1, or a legal transition. If we fail on the root state
1694     then we can either terminate if we have reached an accepting
1695     state previously, or restart the entire process from the beginning
1696     if we have not.
1697
1698     */
1699     while (s <= last_start) {
1700      const U32 uniflags = UTF8_ALLOW_DEFAULT;
1701      U8 *uc = (U8*)s;
1702      U16 charid = 0;
1703      U32 base = 1;
1704      U32 state = 1;
1705      UV uvc = 0;
1706      STRLEN len = 0;
1707      STRLEN foldlen = 0;
1708      U8 *uscan = (U8*)NULL;
1709      U8 *leftmost = NULL;
1710 #ifdef DEBUGGING
1711      U32 accepted_word= 0;
1712 #endif
1713      U32 pointpos = 0;
1714
1715      while ( state && uc <= (U8*)strend ) {
1716       int failed=0;
1717       U32 word = aho->states[ state ].wordnum;
1718
1719       if( state==1 ) {
1720        if ( bitmap ) {
1721         DEBUG_TRIE_EXECUTE_r(
1722          if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1723           dump_exec_pos( (char *)uc, c, strend, real_start,
1724            (char *)uc, do_utf8 );
1725           PerlIO_printf( Perl_debug_log,
1726            " Scanning for legal start char...\n");
1727          }
1728         );
1729         while ( uc <= (U8*)last_start  && !BITMAP_TEST(bitmap,*uc) ) {
1730          uc++;
1731         }
1732         s= (char *)uc;
1733        }
1734        if (uc >(U8*)last_start) break;
1735       }
1736
1737       if ( word ) {
1738        U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1739        if (!leftmost || lpos < leftmost) {
1740         DEBUG_r(accepted_word=word);
1741         leftmost= lpos;
1742        }
1743        if (base==0) break;
1744
1745       }
1746       points[pointpos++ % maxlen]= uc;
1747       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1748            uscan, len, uvc, charid, foldlen,
1749            foldbuf, uniflags);
1750       DEBUG_TRIE_EXECUTE_r({
1751        dump_exec_pos( (char *)uc, c, strend, real_start,
1752         s,   do_utf8 );
1753        PerlIO_printf(Perl_debug_log,
1754         " Charid:%3u CP:%4"UVxf" ",
1755         charid, uvc);
1756       });
1757
1758       do {
1759 #ifdef DEBUGGING
1760        word = aho->states[ state ].wordnum;
1761 #endif
1762        base = aho->states[ state ].trans.base;
1763
1764        DEBUG_TRIE_EXECUTE_r({
1765         if (failed)
1766          dump_exec_pos( (char *)uc, c, strend, real_start,
1767           s,   do_utf8 );
1768         PerlIO_printf( Perl_debug_log,
1769          "%sState: %4"UVxf", word=%"UVxf,
1770          failed ? " Fail transition to " : "",
1771          (UV)state, (UV)word);
1772        });
1773        if ( base ) {
1774         U32 tmp;
1775         if (charid &&
1776          (base + charid > trie->uniquecharcount )
1777          && (base + charid - 1 - trie->uniquecharcount
1778            < trie->lasttrans)
1779          && trie->trans[base + charid - 1 -
1780            trie->uniquecharcount].check == state
1781          && (tmp=trie->trans[base + charid - 1 -
1782           trie->uniquecharcount ].next))
1783         {
1784          DEBUG_TRIE_EXECUTE_r(
1785           PerlIO_printf( Perl_debug_log," - legal\n"));
1786          state = tmp;
1787          break;
1788         }
1789         else {
1790          DEBUG_TRIE_EXECUTE_r(
1791           PerlIO_printf( Perl_debug_log," - fail\n"));
1792          failed = 1;
1793          state = aho->fail[state];
1794         }
1795        }
1796        else {
1797         /* we must be accepting here */
1798         DEBUG_TRIE_EXECUTE_r(
1799           PerlIO_printf( Perl_debug_log," - accepting\n"));
1800         failed = 1;
1801         break;
1802        }
1803       } while(state);
1804       uc += len;
1805       if (failed) {
1806        if (leftmost)
1807         break;
1808        if (!state) state = 1;
1809       }
1810      }
1811      if ( aho->states[ state ].wordnum ) {
1812       U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1813       if (!leftmost || lpos < leftmost) {
1814        DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1815        leftmost = lpos;
1816       }
1817      }
1818      if (leftmost) {
1819       s = (char*)leftmost;
1820       DEBUG_TRIE_EXECUTE_r({
1821        PerlIO_printf(
1822         Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1823         (UV)accepted_word, (IV)(s - real_start)
1824        );
1825       });
1826       if (!reginfo || regtry(reginfo, &s)) {
1827        FREETMPS;
1828        LEAVE;
1829        goto got_it;
1830       }
1831       s = HOPc(s,1);
1832       DEBUG_TRIE_EXECUTE_r({
1833        PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1834       });
1835      } else {
1836       DEBUG_TRIE_EXECUTE_r(
1837        PerlIO_printf( Perl_debug_log,"No match.\n"));
1838       break;
1839      }
1840     }
1841     FREETMPS;
1842     LEAVE;
1843    }
1844    break;
1845   default:
1846    Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1847    break;
1848   }
1849   return 0;
1850  got_it:
1851   return s;
1852 }
1853
1854
1855 /*
1856  - regexec_flags - match a regexp against a string
1857  */
1858 I32
1859 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1860    char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1861 /* strend: pointer to null at end of string */
1862 /* strbeg: real beginning of string */
1863 /* minend: end of match must be >=minend after stringarg. */
1864 /* data: May be used for some additional optimizations.
1865   Currently its only used, with a U32 cast, for transmitting
1866   the ganch offset when doing a /g match. This will change */
1867 /* nosave: For optimizations. */
1868 {
1869  dVAR;
1870  struct regexp *const prog = (struct regexp *)SvANY(rx);
1871  /*register*/ char *s;
1872  register regnode *c;
1873  /*register*/ char *startpos = stringarg;
1874  I32 minlen;  /* must match at least this many chars */
1875  I32 dontbother = 0; /* how many characters not to try at end */
1876  I32 end_shift = 0;   /* Same for the end. */  /* CC */
1877  I32 scream_pos = -1;  /* Internal iterator of scream. */
1878  char *scream_olds = NULL;
1879  const bool do_utf8 = (bool)DO_UTF8(sv);
1880  I32 multiline;
1881  RXi_GET_DECL(prog,progi);
1882  regmatch_info reginfo;  /* create some info to pass to regtry etc */
1883  regexp_paren_pair *swap = NULL;
1884  GET_RE_DEBUG_FLAGS_DECL;
1885
1886  PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1887  PERL_UNUSED_ARG(data);
1888
1889  /* Be paranoid... */
1890  if (prog == NULL || startpos == NULL) {
1891   Perl_croak(aTHX_ "NULL regexp parameter");
1892   return 0;
1893  }
1894
1895  multiline = prog->extflags & RXf_PMf_MULTILINE;
1896  reginfo.prog = rx;  /* Yes, sorry that this is confusing.  */
1897
1898  RX_MATCH_UTF8_set(rx, do_utf8);
1899  DEBUG_EXECUTE_r(
1900   debug_start_match(rx, do_utf8, startpos, strend,
1901   "Matching");
1902  );
1903
1904  minlen = prog->minlen;
1905
1906  if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1907   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1908        "String too short [regexec_flags]...\n"));
1909   goto phooey;
1910  }
1911
1912
1913  /* Check validity of program. */
1914  if (UCHARAT(progi->program) != REG_MAGIC) {
1915   Perl_croak(aTHX_ "corrupted regexp program");
1916  }
1917
1918  PL_reg_flags = 0;
1919  PL_reg_eval_set = 0;
1920  PL_reg_maxiter = 0;
1921
1922  if (RX_UTF8(rx))
1923   PL_reg_flags |= RF_utf8;
1924
1925  /* Mark beginning of line for ^ and lookbehind. */
1926  reginfo.bol = startpos; /* XXX not used ??? */
1927  PL_bostr  = strbeg;
1928  reginfo.sv = sv;
1929
1930  /* Mark end of line for $ (and such) */
1931  PL_regeol = strend;
1932
1933  /* see how far we have to get to not match where we matched before */
1934  reginfo.till = startpos+minend;
1935
1936  /* If there is a "must appear" string, look for it. */
1937  s = startpos;
1938
1939  if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1940   MAGIC *mg;
1941   if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
1942    reginfo.ganch = startpos + prog->gofs;
1943    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1944    "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1945   } else if (sv && SvTYPE(sv) >= SVt_PVMG
1946     && SvMAGIC(sv)
1947     && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1948     && mg->mg_len >= 0) {
1949    reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1950    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1951     "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
1952
1953    if (prog->extflags & RXf_ANCH_GPOS) {
1954     if (s > reginfo.ganch)
1955      goto phooey;
1956     s = reginfo.ganch - prog->gofs;
1957     DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1958      "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
1959     if (s < strbeg)
1960      goto phooey;
1961    }
1962   }
1963   else if (data) {
1964    reginfo.ganch = strbeg + PTR2UV(data);
1965    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1966     "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1967
1968   } else {    /* pos() not defined */
1969    reginfo.ganch = strbeg;
1970    DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1971     "GPOS: reginfo.ganch = strbeg\n"));
1972   }
1973  }
1974  if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1975   /* We have to be careful. If the previous successful match
1976   was from this regex we don't want a subsequent partially
1977   successful match to clobber the old results.
1978   So when we detect this possibility we add a swap buffer
1979   to the re, and switch the buffer each match. If we fail
1980   we switch it back, otherwise we leave it swapped.
1981   */
1982   swap = prog->offs;
1983   /* do we need a save destructor here for eval dies? */
1984   Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
1985  }
1986  if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1987   re_scream_pos_data d;
1988
1989   d.scream_olds = &scream_olds;
1990   d.scream_pos = &scream_pos;
1991   s = re_intuit_start(rx, sv, s, strend, flags, &d);
1992   if (!s) {
1993    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1994    goto phooey; /* not present */
1995   }
1996  }
1997
1998
1999
2000  /* Simplest case:  anchored match need be tried only once. */
2001  /*  [unless only anchor is BOL and multiline is set] */
2002  if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2003   if (s == startpos && regtry(&reginfo, &startpos))
2004    goto got_it;
2005   else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2006     || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2007   {
2008    char *end;
2009
2010    if (minlen)
2011     dontbother = minlen - 1;
2012    end = HOP3c(strend, -dontbother, strbeg) - 1;
2013    /* for multiline we only have to try after newlines */
2014    if (prog->check_substr || prog->check_utf8) {
2015     /* because of the goto we can not easily reuse the macros for bifurcating the
2016     unicode/non-unicode match modes here like we do elsewhere - demerphq */
2017     if (do_utf8) {
2018      if (s == startpos)
2019       goto after_try_utf8;
2020      while (1) {
2021       if (regtry(&reginfo, &s)) {
2022        goto got_it;
2023       }
2024      after_try_utf8:
2025       if (s > end) {
2026        goto phooey;
2027       }
2028       if (prog->extflags & RXf_USE_INTUIT) {
2029        s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2030        if (!s) {
2031         goto phooey;
2032        }
2033       }
2034       else {
2035        s += UTF8SKIP(s);
2036       }
2037      }
2038     } /* end search for check string in unicode */
2039     else {
2040      if (s == startpos) {
2041       goto after_try_latin;
2042      }
2043      while (1) {
2044       if (regtry(&reginfo, &s)) {
2045        goto got_it;
2046       }
2047      after_try_latin:
2048       if (s > end) {
2049        goto phooey;
2050       }
2051       if (prog->extflags & RXf_USE_INTUIT) {
2052        s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2053        if (!s) {
2054         goto phooey;
2055        }
2056       }
2057       else {
2058        s++;
2059       }
2060      }
2061     } /* end search for check string in latin*/
2062    } /* end search for check string */
2063    else { /* search for newline */
2064     if (s > startpos) {
2065      /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2066      s--;
2067     }
2068     /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2069     while (s < end) {
2070      if (*s++ == '\n') { /* don't need PL_utf8skip here */
2071       if (regtry(&reginfo, &s))
2072        goto got_it;
2073      }
2074     }
2075    } /* end search for newline */
2076   } /* end anchored/multiline check string search */
2077   goto phooey;
2078  } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2079  {
2080   /* the warning about reginfo.ganch being used without intialization
2081   is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2082   and we only enter this block when the same bit is set. */
2083   char *tmp_s = reginfo.ganch - prog->gofs;
2084
2085   if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2086    goto got_it;
2087   goto phooey;
2088  }
2089
2090  /* Messy cases:  unanchored match. */
2091  if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2092   /* we have /x+whatever/ */
2093   /* it must be a one character string (XXXX Except UTF?) */
2094   char ch;
2095 #ifdef DEBUGGING
2096   int did_match = 0;
2097 #endif
2098   if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2099    do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2100   ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
2101
2102   if (do_utf8) {
2103    REXEC_FBC_SCAN(
2104     if (*s == ch) {
2105      DEBUG_EXECUTE_r( did_match = 1 );
2106      if (regtry(&reginfo, &s)) goto got_it;
2107      s += UTF8SKIP(s);
2108      while (s < strend && *s == ch)
2109       s += UTF8SKIP(s);
2110     }
2111    );
2112   }
2113   else {
2114    REXEC_FBC_SCAN(
2115     if (*s == ch) {
2116      DEBUG_EXECUTE_r( did_match = 1 );
2117      if (regtry(&reginfo, &s)) goto got_it;
2118      s++;
2119      while (s < strend && *s == ch)
2120       s++;
2121     }
2122    );
2123   }
2124   DEBUG_EXECUTE_r(if (!did_match)
2125     PerlIO_printf(Perl_debug_log,
2126         "Did not find anchored character...\n")
2127    );
2128  }
2129  else if (prog->anchored_substr != NULL
2130    || prog->anchored_utf8 != NULL
2131    || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2132     && prog->float_max_offset < strend - s)) {
2133   SV *must;
2134   I32 back_max;
2135   I32 back_min;
2136   char *last;
2137   char *last1;  /* Last position checked before */
2138 #ifdef DEBUGGING
2139   int did_match = 0;
2140 #endif
2141   if (prog->anchored_substr || prog->anchored_utf8) {
2142    if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2143     do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2144    must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2145    back_max = back_min = prog->anchored_offset;
2146   } else {
2147    if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2148     do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2149    must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2150    back_max = prog->float_max_offset;
2151    back_min = prog->float_min_offset;
2152   }
2153
2154
2155   if (must == &PL_sv_undef)
2156    /* could not downgrade utf8 check substring, so must fail */
2157    goto phooey;
2158
2159   if (back_min<0) {
2160    last = strend;
2161   } else {
2162    last = HOP3c(strend, /* Cannot start after this */
2163     -(I32)(CHR_SVLEN(must)
2164       - (SvTAIL(must) != 0) + back_min), strbeg);
2165   }
2166   if (s > PL_bostr)
2167    last1 = HOPc(s, -1);
2168   else
2169    last1 = s - 1; /* bogus */
2170
2171   /* XXXX check_substr already used to find "s", can optimize if
2172   check_substr==must. */
2173   scream_pos = -1;
2174   dontbother = end_shift;
2175   strend = HOPc(strend, -dontbother);
2176   while ( (s <= last) &&
2177     ((flags & REXEC_SCREAM)
2178     ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2179          end_shift, &scream_pos, 0))
2180     : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2181         (unsigned char*)strend, must,
2182         multiline ? FBMrf_MULTILINE : 0))) ) {
2183    /* we may be pointing at the wrong string */
2184    if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2185     s = strbeg + (s - SvPVX_const(sv));
2186    DEBUG_EXECUTE_r( did_match = 1 );
2187    if (HOPc(s, -back_max) > last1) {
2188     last1 = HOPc(s, -back_min);
2189     s = HOPc(s, -back_max);
2190    }
2191    else {
2192     char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2193
2194     last1 = HOPc(s, -back_min);
2195     s = t;
2196    }
2197    if (do_utf8) {
2198     while (s <= last1) {
2199      if (regtry(&reginfo, &s))
2200       goto got_it;
2201      s += UTF8SKIP(s);
2202     }
2203    }
2204    else {
2205     while (s <= last1) {
2206      if (regtry(&reginfo, &s))
2207       goto got_it;
2208      s++;
2209     }
2210    }
2211   }
2212   DEBUG_EXECUTE_r(if (!did_match) {
2213    RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2214     SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2215    PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2216        ((must == prog->anchored_substr || must == prog->anchored_utf8)
2217        ? "anchored" : "floating"),
2218     quoted, RE_SV_TAIL(must));
2219   });
2220   goto phooey;
2221  }
2222  else if ( (c = progi->regstclass) ) {
2223   if (minlen) {
2224    const OPCODE op = OP(progi->regstclass);
2225    /* don't bother with what can't match */
2226    if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2227     strend = HOPc(strend, -(minlen - 1));
2228   }
2229   DEBUG_EXECUTE_r({
2230    SV * const prop = sv_newmortal();
2231    regprop(prog, prop, c);
2232    {
2233     RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2234      s,strend-s,60);
2235     PerlIO_printf(Perl_debug_log,
2236      "Matching stclass %.*s against %s (%d chars)\n",
2237      (int)SvCUR(prop), SvPVX_const(prop),
2238      quoted, (int)(strend - s));
2239    }
2240   });
2241   if (find_byclass(prog, c, s, strend, &reginfo))
2242    goto got_it;
2243   DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2244  }
2245  else {
2246   dontbother = 0;
2247   if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2248    /* Trim the end. */
2249    char *last;
2250    SV* float_real;
2251
2252    if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2253     do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2254    float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2255
2256    if (flags & REXEC_SCREAM) {
2257     last = screaminstr(sv, float_real, s - strbeg,
2258         end_shift, &scream_pos, 1); /* last one */
2259     if (!last)
2260      last = scream_olds; /* Only one occurrence. */
2261     /* we may be pointing at the wrong string */
2262     else if (RXp_MATCH_COPIED(prog))
2263      s = strbeg + (s - SvPVX_const(sv));
2264    }
2265    else {
2266     STRLEN len;
2267     const char * const little = SvPV_const(float_real, len);
2268
2269     if (SvTAIL(float_real)) {
2270      if (memEQ(strend - len + 1, little, len - 1))
2271       last = strend - len + 1;
2272      else if (!multiline)
2273       last = memEQ(strend - len, little, len)
2274        ? strend - len : NULL;
2275      else
2276       goto find_last;
2277     } else {
2278     find_last:
2279      if (len)
2280       last = rninstr(s, strend, little, little + len);
2281      else
2282       last = strend; /* matching "$" */
2283     }
2284    }
2285    if (last == NULL) {
2286     DEBUG_EXECUTE_r(
2287      PerlIO_printf(Perl_debug_log,
2288       "%sCan't trim the tail, match fails (should not happen)%s\n",
2289       PL_colors[4], PL_colors[5]));
2290     goto phooey; /* Should not happen! */
2291    }
2292    dontbother = strend - last + prog->float_min_offset;
2293   }
2294   if (minlen && (dontbother < minlen))
2295    dontbother = minlen - 1;
2296   strend -= dontbother;      /* this one's always in bytes! */
2297   /* We don't know much -- general case. */
2298   if (do_utf8) {
2299    for (;;) {
2300     if (regtry(&reginfo, &s))
2301      goto got_it;
2302     if (s >= strend)
2303      break;
2304     s += UTF8SKIP(s);
2305    };
2306   }
2307   else {
2308    do {
2309     if (regtry(&reginfo, &s))
2310      goto got_it;
2311    } while (s++ < strend);
2312   }
2313  }
2314
2315  /* Failure. */
2316  goto phooey;
2317
2318 got_it:
2319  Safefree(swap);
2320  RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2321
2322  if (PL_reg_eval_set)
2323   restore_pos(aTHX_ prog);
2324  if (RXp_PAREN_NAMES(prog))
2325   (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2326
2327  /* make sure $`, $&, $', and $digit will work later */
2328  if ( !(flags & REXEC_NOT_FIRST) ) {
2329   RX_MATCH_COPY_FREE(rx);
2330   if (flags & REXEC_COPY_STR) {
2331    const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2332 #ifdef PERL_OLD_COPY_ON_WRITE
2333    if ((SvIsCOW(sv)
2334     || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2335     if (DEBUG_C_TEST) {
2336      PerlIO_printf(Perl_debug_log,
2337         "Copy on write: regexp capture, type %d\n",
2338         (int) SvTYPE(sv));
2339     }
2340     prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2341     prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2342     assert (SvPOKp(prog->saved_copy));
2343    } else
2344 #endif
2345    {
2346     RX_MATCH_COPIED_on(rx);
2347     s = savepvn(strbeg, i);
2348     prog->subbeg = s;
2349    }
2350    prog->sublen = i;
2351   }
2352   else {
2353    prog->subbeg = strbeg;
2354    prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2355   }
2356  }
2357
2358  return 1;
2359
2360 phooey:
2361  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2362       PL_colors[4], PL_colors[5]));
2363  if (PL_reg_eval_set)
2364   restore_pos(aTHX_ prog);
2365  if (swap) {
2366   /* we failed :-( roll it back */
2367   Safefree(prog->offs);
2368   prog->offs = swap;
2369  }
2370
2371  return 0;
2372 }
2373
2374
2375 /*
2376  - regtry - try match at specific point
2377  */
2378 STATIC I32   /* 0 failure, 1 success */
2379 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2380 {
2381  dVAR;
2382  CHECKPOINT lastcp;
2383  REGEXP *const rx = reginfo->prog;
2384  regexp *const prog = (struct regexp *)SvANY(rx);
2385  RXi_GET_DECL(prog,progi);
2386  GET_RE_DEBUG_FLAGS_DECL;
2387
2388  PERL_ARGS_ASSERT_REGTRY;
2389
2390  reginfo->cutpoint=NULL;
2391
2392  if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2393   MAGIC *mg;
2394
2395   PL_reg_eval_set = RS_init;
2396   DEBUG_EXECUTE_r(DEBUG_s(
2397    PerlIO_printf(Perl_debug_log, "  setting stack tmpbase at %"IVdf"\n",
2398       (IV)(PL_stack_sp - PL_stack_base));
2399    ));
2400   SAVESTACK_CXPOS();
2401   cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2402   /* Otherwise OP_NEXTSTATE will free whatever on stack now.  */
2403   SAVETMPS;
2404   /* Apparently this is not needed, judging by wantarray. */
2405   /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2406   cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2407
2408   if (reginfo->sv) {
2409    /* Make $_ available to executed code. */
2410    if (reginfo->sv != DEFSV) {
2411     SAVE_DEFSV;
2412     DEFSV_set(reginfo->sv);
2413    }
2414
2415    if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2416     && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2417     /* prepare for quick setting of pos */
2418 #ifdef PERL_OLD_COPY_ON_WRITE
2419     if (SvIsCOW(reginfo->sv))
2420      sv_force_normal_flags(reginfo->sv, 0);
2421 #endif
2422     mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2423         &PL_vtbl_mglob, NULL, 0);
2424     mg->mg_len = -1;
2425    }
2426    PL_reg_magic    = mg;
2427    PL_reg_oldpos   = mg->mg_len;
2428    SAVEDESTRUCTOR_X(restore_pos, prog);
2429   }
2430   if (!PL_reg_curpm) {
2431    Newxz(PL_reg_curpm, 1, PMOP);
2432 #ifdef USE_ITHREADS
2433    {
2434     SV* const repointer = &PL_sv_undef;
2435     /* this regexp is also owned by the new PL_reg_curpm, which
2436     will try to free it.  */
2437     av_push(PL_regex_padav, repointer);
2438     PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2439     PL_regex_pad = AvARRAY(PL_regex_padav);
2440    }
2441 #endif
2442   }
2443 #ifdef USE_ITHREADS
2444   /* It seems that non-ithreads works both with and without this code.
2445   So for efficiency reasons it seems best not to have the code
2446   compiled when it is not needed.  */
2447   /* This is safe against NULLs: */
2448   ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2449   /* PM_reg_curpm owns a reference to this regexp.  */
2450   ReREFCNT_inc(rx);
2451 #endif
2452   PM_SETRE(PL_reg_curpm, rx);
2453   PL_reg_oldcurpm = PL_curpm;
2454   PL_curpm = PL_reg_curpm;
2455   if (RXp_MATCH_COPIED(prog)) {
2456    /*  Here is a serious problem: we cannot rewrite subbeg,
2457     since it may be needed if this match fails.  Thus
2458     $` inside (?{}) could fail... */
2459    PL_reg_oldsaved = prog->subbeg;
2460    PL_reg_oldsavedlen = prog->sublen;
2461 #ifdef PERL_OLD_COPY_ON_WRITE
2462    PL_nrs = prog->saved_copy;
2463 #endif
2464    RXp_MATCH_COPIED_off(prog);
2465   }
2466   else
2467    PL_reg_oldsaved = NULL;
2468   prog->subbeg = PL_bostr;
2469   prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2470  }
2471  DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2472  prog->offs[0].start = *startpos - PL_bostr;
2473  PL_reginput = *startpos;
2474  PL_reglastparen = &prog->lastparen;
2475  PL_reglastcloseparen = &prog->lastcloseparen;
2476  prog->lastparen = 0;
2477  prog->lastcloseparen = 0;
2478  PL_regsize = 0;
2479  PL_regoffs = prog->offs;
2480  if (PL_reg_start_tmpl <= prog->nparens) {
2481   PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2482   if(PL_reg_start_tmp)
2483    Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2484   else
2485    Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2486  }
2487
2488  /* XXXX What this code is doing here?!!!  There should be no need
2489  to do this again and again, PL_reglastparen should take care of
2490  this!  --ilya*/
2491
2492  /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2493  * Actually, the code in regcppop() (which Ilya may be meaning by
2494  * PL_reglastparen), is not needed at all by the test suite
2495  * (op/regexp, op/pat, op/split), but that code is needed otherwise
2496  * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2497  * Meanwhile, this code *is* needed for the
2498  * above-mentioned test suite tests to succeed.  The common theme
2499  * on those tests seems to be returning null fields from matches.
2500  * --jhi updated by dapm */
2501 #if 1
2502  if (prog->nparens) {
2503   regexp_paren_pair *pp = PL_regoffs;
2504   register I32 i;
2505   for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2506    ++pp;
2507    pp->start = -1;
2508    pp->end = -1;
2509   }
2510  }
2511 #endif
2512  REGCP_SET(lastcp);
2513  if (regmatch(reginfo, progi->program + 1)) {
2514   PL_regoffs[0].end = PL_reginput - PL_bostr;
2515   return 1;
2516  }
2517  if (reginfo->cutpoint)
2518   *startpos= reginfo->cutpoint;
2519  REGCP_UNWIND(lastcp);
2520  return 0;
2521 }
2522
2523
2524 #define sayYES goto yes
2525 #define sayNO goto no
2526 #define sayNO_SILENT goto no_silent
2527
2528 /* we dont use STMT_START/END here because it leads to
2529    "unreachable code" warnings, which are bogus, but distracting. */
2530 #define CACHEsayNO \
2531  if (ST.cache_mask) \
2532  PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2533  sayNO
2534
2535 /* this is used to determine how far from the left messages like
2536    'failed...' are printed. It should be set such that messages
2537    are inline with the regop output that created them.
2538 */
2539 #define REPORT_CODE_OFF 32
2540
2541
2542 /* Make sure there is a test for this +1 options in re_tests */
2543 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2544
2545 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2546 #define CHRTEST_VOID   -1000 /* the c1/c2 "next char" test should be skipped */
2547
2548 #define SLAB_FIRST(s) (&(s)->states[0])
2549 #define SLAB_LAST(s)  (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2550
2551 /* grab a new slab and return the first slot in it */
2552
2553 STATIC regmatch_state *
2554 S_push_slab(pTHX)
2555 {
2556 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2557  dMY_CXT;
2558 #endif
2559  regmatch_slab *s = PL_regmatch_slab->next;
2560  if (!s) {
2561   Newx(s, 1, regmatch_slab);
2562   s->prev = PL_regmatch_slab;
2563   s->next = NULL;
2564   PL_regmatch_slab->next = s;
2565  }
2566  PL_regmatch_slab = s;
2567  return SLAB_FIRST(s);
2568 }
2569
2570
2571 /* push a new state then goto it */
2572
2573 #define PUSH_STATE_GOTO(state, node) \
2574  scan = node; \
2575  st->resume_state = state; \
2576  goto push_state;
2577
2578 /* push a new state with success backtracking, then goto it */
2579
2580 #define PUSH_YES_STATE_GOTO(state, node) \
2581  scan = node; \
2582  st->resume_state = state; \
2583  goto push_yes_state;
2584
2585
2586
2587 /*
2588
2589 regmatch() - main matching routine
2590
2591 This is basically one big switch statement in a loop. We execute an op,
2592 set 'next' to point the next op, and continue. If we come to a point which
2593 we may need to backtrack to on failure such as (A|B|C), we push a
2594 backtrack state onto the backtrack stack. On failure, we pop the top
2595 state, and re-enter the loop at the state indicated. If there are no more
2596 states to pop, we return failure.
2597
2598 Sometimes we also need to backtrack on success; for example /A+/, where
2599 after successfully matching one A, we need to go back and try to
2600 match another one; similarly for lookahead assertions: if the assertion
2601 completes successfully, we backtrack to the state just before the assertion
2602 and then carry on.  In these cases, the pushed state is marked as
2603 'backtrack on success too'. This marking is in fact done by a chain of
2604 pointers, each pointing to the previous 'yes' state. On success, we pop to
2605 the nearest yes state, discarding any intermediate failure-only states.
2606 Sometimes a yes state is pushed just to force some cleanup code to be
2607 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2608 it to free the inner regex.
2609
2610 Note that failure backtracking rewinds the cursor position, while
2611 success backtracking leaves it alone.
2612
2613 A pattern is complete when the END op is executed, while a subpattern
2614 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2615 ops trigger the "pop to last yes state if any, otherwise return true"
2616 behaviour.
2617
2618 A common convention in this function is to use A and B to refer to the two
2619 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2620 the subpattern to be matched possibly multiple times, while B is the entire
2621 rest of the pattern. Variable and state names reflect this convention.
2622
2623 The states in the main switch are the union of ops and failure/success of
2624 substates associated with with that op.  For example, IFMATCH is the op
2625 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2626 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2627 successfully matched A and IFMATCH_A_fail is a state saying that we have
2628 just failed to match A. Resume states always come in pairs. The backtrack
2629 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2630 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2631 on success or failure.
2632
2633 The struct that holds a backtracking state is actually a big union, with
2634 one variant for each major type of op. The variable st points to the
2635 top-most backtrack struct. To make the code clearer, within each
2636 block of code we #define ST to alias the relevant union.
2637
2638 Here's a concrete example of a (vastly oversimplified) IFMATCH
2639 implementation:
2640
2641  switch (state) {
2642  ....
2643
2644 #define ST st->u.ifmatch
2645
2646  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2647   ST.foo = ...; // some state we wish to save
2648   ...
2649   // push a yes backtrack state with a resume value of
2650   // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2651   // first node of A:
2652   PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2653   // NOTREACHED
2654
2655  case IFMATCH_A: // we have successfully executed A; now continue with B
2656   next = B;
2657   bar = ST.foo; // do something with the preserved value
2658   break;
2659
2660  case IFMATCH_A_fail: // A failed, so the assertion failed
2661   ...;   // do some housekeeping, then ...
2662   sayNO; // propagate the failure
2663
2664 #undef ST
2665
2666  ...
2667  }
2668
2669 For any old-timers reading this who are familiar with the old recursive
2670 approach, the code above is equivalent to:
2671
2672  case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2673  {
2674   int foo = ...
2675   ...
2676   if (regmatch(A)) {
2677    next = B;
2678    bar = foo;
2679    break;
2680   }
2681   ...;   // do some housekeeping, then ...
2682   sayNO; // propagate the failure
2683  }
2684
2685 The topmost backtrack state, pointed to by st, is usually free. If you
2686 want to claim it, populate any ST.foo fields in it with values you wish to
2687 save, then do one of
2688
2689   PUSH_STATE_GOTO(resume_state, node);
2690   PUSH_YES_STATE_GOTO(resume_state, node);
2691
2692 which sets that backtrack state's resume value to 'resume_state', pushes a
2693 new free entry to the top of the backtrack stack, then goes to 'node'.
2694 On backtracking, the free slot is popped, and the saved state becomes the
2695 new free state. An ST.foo field in this new top state can be temporarily
2696 accessed to retrieve values, but once the main loop is re-entered, it
2697 becomes available for reuse.
2698
2699 Note that the depth of the backtrack stack constantly increases during the
2700 left-to-right execution of the pattern, rather than going up and down with
2701 the pattern nesting. For example the stack is at its maximum at Z at the
2702 end of the pattern, rather than at X in the following:
2703
2704  /(((X)+)+)+....(Y)+....Z/
2705
2706 The only exceptions to this are lookahead/behind assertions and the cut,
2707 (?>A), which pop all the backtrack states associated with A before
2708 continuing.
2709
2710 Bascktrack state structs are allocated in slabs of about 4K in size.
2711 PL_regmatch_state and st always point to the currently active state,
2712 and PL_regmatch_slab points to the slab currently containing
2713 PL_regmatch_state.  The first time regmatch() is called, the first slab is
2714 allocated, and is never freed until interpreter destruction. When the slab
2715 is full, a new one is allocated and chained to the end. At exit from
2716 regmatch(), slabs allocated since entry are freed.
2717
2718 */
2719
2720
2721 #define DEBUG_STATE_pp(pp)        \
2722  DEBUG_STATE_r({         \
2723   DUMP_EXEC_POS(locinput, scan, do_utf8);      \
2724   PerlIO_printf(Perl_debug_log,       \
2725    "    %*s"pp" %s%s%s%s%s\n",       \
2726    depth*2, "",        \
2727    PL_reg_name[st->resume_state],                     \
2728    ((st==yes_state||st==mark_state) ? "[" : ""),   \
2729    ((st==yes_state) ? "Y" : ""),                   \
2730    ((st==mark_state) ? "M" : ""),                  \
2731    ((st==yes_state||st==mark_state) ? "]" : "")    \
2732   );                                                  \
2733  });
2734
2735
2736 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2737
2738 #ifdef DEBUGGING
2739
2740 STATIC void
2741 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2742  const char *start, const char *end, const char *blurb)
2743 {
2744  const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2745
2746  PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2747
2748  if (!PL_colorset)
2749    reginitcolors();
2750  {
2751   RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2752    RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2753
2754   RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2755    start, end - start, 60);
2756
2757   PerlIO_printf(Perl_debug_log,
2758    "%s%s REx%s %s against %s\n",
2759      PL_colors[4], blurb, PL_colors[5], s0, s1);
2760
2761   if (do_utf8||utf8_pat)
2762    PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2763     utf8_pat ? "pattern" : "",
2764     utf8_pat && do_utf8 ? " and " : "",
2765     do_utf8 ? "string" : ""
2766    );
2767  }
2768 }
2769
2770 STATIC void
2771 S_dump_exec_pos(pTHX_ const char *locinput,
2772      const regnode *scan,
2773      const char *loc_regeol,
2774      const char *loc_bostr,
2775      const char *loc_reg_starttry,
2776      const bool do_utf8)
2777 {
2778  const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2779  const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2780  int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2781  /* The part of the string before starttry has one color
2782  (pref0_len chars), between starttry and current
2783  position another one (pref_len - pref0_len chars),
2784  after the current position the third one.
2785  We assume that pref0_len <= pref_len, otherwise we
2786  decrease pref0_len.  */
2787  int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2788   ? (5 + taill) - l : locinput - loc_bostr;
2789  int pref0_len;
2790
2791  PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2792
2793  while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2794   pref_len++;
2795  pref0_len = pref_len  - (locinput - loc_reg_starttry);
2796  if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2797   l = ( loc_regeol - locinput > (5 + taill) - pref_len
2798    ? (5 + taill) - pref_len : loc_regeol - locinput);
2799  while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2800   l--;
2801  if (pref0_len < 0)
2802   pref0_len = 0;
2803  if (pref0_len > pref_len)
2804   pref0_len = pref_len;
2805  {
2806   const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2807
2808   RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2809    (locinput - pref_len),pref0_len, 60, 4, 5);
2810
2811   RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2812      (locinput - pref_len + pref0_len),
2813      pref_len - pref0_len, 60, 2, 3);
2814
2815   RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2816      locinput, loc_regeol - locinput, 10, 0, 1);
2817
2818   const STRLEN tlen=len0+len1+len2;
2819   PerlIO_printf(Perl_debug_log,
2820      "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2821      (IV)(locinput - loc_bostr),
2822      len0, s0,
2823      len1, s1,
2824      (docolor ? "" : "> <"),
2825      len2, s2,
2826      (int)(tlen > 19 ? 0 :  19 - tlen),
2827      "");
2828  }
2829 }
2830
2831 #endif
2832
2833 /* reg_check_named_buff_matched()
2834  * Checks to see if a named buffer has matched. The data array of
2835  * buffer numbers corresponding to the buffer is expected to reside
2836  * in the regexp->data->data array in the slot stored in the ARG() of
2837  * node involved. Note that this routine doesn't actually care about the
2838  * name, that information is not preserved from compilation to execution.
2839  * Returns the index of the leftmost defined buffer with the given name
2840  * or 0 if non of the buffers matched.
2841  */
2842 STATIC I32
2843 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2844 {
2845  I32 n;
2846  RXi_GET_DECL(rex,rexi);
2847  SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2848  I32 *nums=(I32*)SvPVX(sv_dat);
2849
2850  PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2851
2852  for ( n=0; n<SvIVX(sv_dat); n++ ) {
2853   if ((I32)*PL_reglastparen >= nums[n] &&
2854    PL_regoffs[nums[n]].end != -1)
2855   {
2856    return nums[n];
2857   }
2858  }
2859  return 0;
2860 }
2861
2862
2863 /* free all slabs above current one  - called during LEAVE_SCOPE */
2864
2865 STATIC void
2866 S_clear_backtrack_stack(pTHX_ void *p)
2867 {
2868  regmatch_slab *s = PL_regmatch_slab->next;
2869  PERL_UNUSED_ARG(p);
2870
2871  if (!s)
2872   return;
2873  PL_regmatch_slab->next = NULL;
2874  while (s) {
2875   regmatch_slab * const osl = s;
2876   s = s->next;
2877   Safefree(osl);
2878  }
2879 }
2880
2881
2882 #define SETREX(Re1,Re2) \
2883  if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2884  Re1 = (Re2)
2885
2886 STATIC I32   /* 0 failure, 1 success */
2887 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2888 {
2889 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2890  dMY_CXT;
2891 #endif
2892  dVAR;
2893  register const bool do_utf8 = PL_reg_match_utf8;
2894  const U32 uniflags = UTF8_ALLOW_DEFAULT;
2895  REGEXP *rex_sv = reginfo->prog;
2896  regexp *rex = (struct regexp *)SvANY(rex_sv);
2897  RXi_GET_DECL(rex,rexi);
2898  I32 oldsave;
2899  /* the current state. This is a cached copy of PL_regmatch_state */
2900  register regmatch_state *st;
2901  /* cache heavy used fields of st in registers */
2902  register regnode *scan;
2903  register regnode *next;
2904  register U32 n = 0; /* general value; init to avoid compiler warning */
2905  register I32 ln = 0; /* len or last;  init to avoid compiler warning */
2906  register char *locinput = PL_reginput;
2907  register I32 nextchr;   /* is always set to UCHARAT(locinput) */
2908
2909  bool result = 0;     /* return value of S_regmatch */
2910  int depth = 0;     /* depth of backtrack stack */
2911  U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2912  const U32 max_nochange_depth =
2913   (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2914   3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2915  regmatch_state *yes_state = NULL; /* state to pop to on success of
2916                subpattern */
2917  /* mark_state piggy backs on the yes_state logic so that when we unwind
2918  the stack on success we can update the mark_state as we go */
2919  regmatch_state *mark_state = NULL; /* last mark state we have seen */
2920  regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2921  struct regmatch_state  *cur_curlyx = NULL; /* most recent curlyx */
2922  U32 state_num;
2923  bool no_final = 0;      /* prevent failure from backtracking? */
2924  bool do_cutgroup = 0;   /* no_final only until next branch/trie entry */
2925  char *startpoint = PL_reginput;
2926  SV *popmark = NULL;     /* are we looking for a mark? */
2927  SV *sv_commit = NULL;   /* last mark name seen in failure */
2928  SV *sv_yes_mark = NULL; /* last mark name we have seen
2929        during a successfull match */
2930  U32 lastopen = 0;       /* last open we saw */
2931  bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2932  SV* const oreplsv = GvSV(PL_replgv);
2933  /* these three flags are set by various ops to signal information to
2934  * the very next op. They have a useful lifetime of exactly one loop
2935  * iteration, and are not preserved or restored by state pushes/pops
2936  */
2937  bool sw = 0;     /* the condition value in (?(cond)a|b) */
2938  bool minmod = 0;     /* the next "{n,m}" is a "{n,m}?" */
2939  int logical = 0;     /* the following EVAL is:
2940         0: (?{...})
2941         1: (?(?{...})X|Y)
2942         2: (??{...})
2943        or the following IFMATCH/UNLESSM is:
2944         false: plain (?=foo)
2945         true:  used as a condition: (?(?=foo))
2946        */
2947 #ifdef DEBUGGING
2948  GET_RE_DEBUG_FLAGS_DECL;
2949 #endif
2950
2951  PERL_ARGS_ASSERT_REGMATCH;
2952
2953  DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2954    PerlIO_printf(Perl_debug_log,"regmatch start\n");
2955  }));
2956  /* on first ever call to regmatch, allocate first slab */
2957  if (!PL_regmatch_slab) {
2958   Newx(PL_regmatch_slab, 1, regmatch_slab);
2959   PL_regmatch_slab->prev = NULL;
2960   PL_regmatch_slab->next = NULL;
2961   PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2962  }
2963
2964  oldsave = PL_savestack_ix;
2965  SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2966  SAVEVPTR(PL_regmatch_slab);
2967  SAVEVPTR(PL_regmatch_state);
2968
2969  /* grab next free state slot */
2970  st = ++PL_regmatch_state;
2971  if (st >  SLAB_LAST(PL_regmatch_slab))
2972   st = PL_regmatch_state = S_push_slab(aTHX);
2973
2974  /* Note that nextchr is a byte even in UTF */
2975  nextchr = UCHARAT(locinput);
2976  scan = prog;
2977  while (scan != NULL) {
2978
2979   DEBUG_EXECUTE_r( {
2980    SV * const prop = sv_newmortal();
2981    regnode *rnext=regnext(scan);
2982    DUMP_EXEC_POS( locinput, scan, do_utf8 );
2983    regprop(rex, prop, scan);
2984
2985    PerlIO_printf(Perl_debug_log,
2986      "%3"IVdf":%*s%s(%"IVdf")\n",
2987      (IV)(scan - rexi->program), depth*2, "",
2988      SvPVX_const(prop),
2989      (PL_regkind[OP(scan)] == END || !rnext) ?
2990       0 : (IV)(rnext - rexi->program));
2991   });
2992
2993   next = scan + NEXT_OFF(scan);
2994   if (next == scan)
2995    next = NULL;
2996   state_num = OP(scan);
2997
2998   REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
2999  reenter_switch:
3000
3001   assert(PL_reglastparen == &rex->lastparen);
3002   assert(PL_reglastcloseparen == &rex->lastcloseparen);
3003   assert(PL_regoffs == rex->offs);
3004
3005   switch (state_num) {
3006   case BOL:
3007    if (locinput == PL_bostr)
3008    {
3009     /* reginfo->till = reginfo->bol; */
3010     break;
3011    }
3012    sayNO;
3013   case MBOL:
3014    if (locinput == PL_bostr ||
3015     ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3016    {
3017     break;
3018    }
3019    sayNO;
3020   case SBOL:
3021    if (locinput == PL_bostr)
3022     break;
3023    sayNO;
3024   case GPOS:
3025    if (locinput == reginfo->ganch)
3026     break;
3027    sayNO;
3028
3029   case KEEPS:
3030    /* update the startpoint */
3031    st->u.keeper.val = PL_regoffs[0].start;
3032    PL_reginput = locinput;
3033    PL_regoffs[0].start = locinput - PL_bostr;
3034    PUSH_STATE_GOTO(KEEPS_next, next);
3035    /*NOT-REACHED*/
3036   case KEEPS_next_fail:
3037    /* rollback the start point change */
3038    PL_regoffs[0].start = st->u.keeper.val;
3039    sayNO_SILENT;
3040    /*NOT-REACHED*/
3041   case EOL:
3042     goto seol;
3043   case MEOL:
3044    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3045     sayNO;
3046    break;
3047   case SEOL:
3048   seol:
3049    if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3050     sayNO;
3051    if (PL_regeol - locinput > 1)
3052     sayNO;
3053    break;
3054   case EOS:
3055    if (PL_regeol != locinput)
3056     sayNO;
3057    break;
3058   case SANY:
3059    if (!nextchr && locinput >= PL_regeol)
3060     sayNO;
3061    if (do_utf8) {
3062     locinput += PL_utf8skip[nextchr];
3063     if (locinput > PL_regeol)
3064      sayNO;
3065     nextchr = UCHARAT(locinput);
3066    }
3067    else
3068     nextchr = UCHARAT(++locinput);
3069    break;
3070   case CANY:
3071    if (!nextchr && locinput >= PL_regeol)
3072     sayNO;
3073    nextchr = UCHARAT(++locinput);
3074    break;
3075   case REG_ANY:
3076    if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3077     sayNO;
3078    if (do_utf8) {
3079     locinput += PL_utf8skip[nextchr];
3080     if (locinput > PL_regeol)
3081      sayNO;
3082     nextchr = UCHARAT(locinput);
3083    }
3084    else
3085     nextchr = UCHARAT(++locinput);
3086    break;
3087
3088 #undef  ST
3089 #define ST st->u.trie
3090   case TRIEC:
3091    /* In this case the charclass data is available inline so
3092    we can fail fast without a lot of extra overhead.
3093    */
3094    if (scan->flags == EXACT || !do_utf8) {
3095     if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3096      DEBUG_EXECUTE_r(
3097       PerlIO_printf(Perl_debug_log,
3098          "%*s  %sfailed to match trie start class...%s\n",
3099          REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3100      );
3101      sayNO_SILENT;
3102      /* NOTREACHED */
3103     }
3104    }
3105    /* FALL THROUGH */
3106   case TRIE:
3107    {
3108     /* what type of TRIE am I? (utf8 makes this contextual) */
3109     DECL_TRIE_TYPE(scan);
3110
3111     /* what trie are we using right now */
3112     reg_trie_data * const trie
3113      = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3114     HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3115     U32 state = trie->startstate;
3116
3117     if (trie->bitmap && trie_type != trie_utf8_fold &&
3118      !TRIE_BITMAP_TEST(trie,*locinput)
3119     ) {
3120      if (trie->states[ state ].wordnum) {
3121       DEBUG_EXECUTE_r(
3122        PerlIO_printf(Perl_debug_log,
3123           "%*s  %smatched empty string...%s\n",
3124           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3125       );
3126       break;
3127      } else {
3128       DEBUG_EXECUTE_r(
3129        PerlIO_printf(Perl_debug_log,
3130           "%*s  %sfailed to match trie start class...%s\n",
3131           REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3132       );
3133       sayNO_SILENT;
3134     }
3135     }
3136
3137    {
3138     U8 *uc = ( U8* )locinput;
3139
3140     STRLEN len = 0;
3141     STRLEN foldlen = 0;
3142     U8 *uscan = (U8*)NULL;
3143     STRLEN bufflen=0;
3144     SV *sv_accept_buff = NULL;
3145     U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3146
3147      ST.accepted = 0; /* how many accepting states we have seen */
3148     ST.B = next;
3149     ST.jump = trie->jump;
3150     ST.me = scan;
3151     /*
3152     traverse the TRIE keeping track of all accepting states
3153     we transition through until we get to a failing node.
3154     */
3155
3156     while ( state && uc <= (U8*)PL_regeol ) {
3157      U32 base = trie->states[ state ].trans.base;
3158      UV uvc = 0;
3159      U16 charid;
3160      /* We use charid to hold the wordnum as we don't use it
3161      for charid until after we have done the wordnum logic.
3162      We define an alias just so that the wordnum logic reads
3163      more naturally. */
3164
3165 #define got_wordnum charid
3166      got_wordnum = trie->states[ state ].wordnum;
3167
3168      if ( got_wordnum ) {
3169       if ( ! ST.accepted ) {
3170        ENTER;
3171        SAVETMPS; /* XXX is this necessary? dmq */
3172        bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3173        sv_accept_buff=newSV(bufflen *
3174            sizeof(reg_trie_accepted) - 1);
3175        SvCUR_set(sv_accept_buff, 0);
3176        SvPOK_on(sv_accept_buff);
3177        sv_2mortal(sv_accept_buff);
3178        SAVETMPS;
3179        ST.accept_buff =
3180         (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3181       }
3182       do {
3183        if (ST.accepted >= bufflen) {
3184         bufflen *= 2;
3185         ST.accept_buff =(reg_trie_accepted*)
3186          SvGROW(sv_accept_buff,
3187            bufflen * sizeof(reg_trie_accepted));
3188        }
3189        SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3190         + sizeof(reg_trie_accepted));
3191
3192
3193        ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3194        ST.accept_buff[ST.accepted].endpos = uc;
3195        ++ST.accepted;
3196       } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3197      }
3198 #undef got_wordnum
3199
3200      DEBUG_TRIE_EXECUTE_r({
3201         DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3202         PerlIO_printf( Perl_debug_log,
3203          "%*s  %sState: %4"UVxf" Accepted: %4"UVxf" ",
3204          2+depth * 2, "", PL_colors[4],
3205          (UV)state, (UV)ST.accepted );
3206      });
3207
3208      if ( base ) {
3209       REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3210            uscan, len, uvc, charid, foldlen,
3211            foldbuf, uniflags);
3212
3213       if (charid &&
3214        (base + charid > trie->uniquecharcount )
3215        && (base + charid - 1 - trie->uniquecharcount
3216          < trie->lasttrans)
3217        && trie->trans[base + charid - 1 -
3218          trie->uniquecharcount].check == state)
3219       {
3220        state = trie->trans[base + charid - 1 -
3221         trie->uniquecharcount ].next;
3222       }
3223       else {
3224        state = 0;
3225       }
3226       uc += len;
3227
3228      }
3229      else {
3230       state = 0;
3231      }
3232      DEBUG_TRIE_EXECUTE_r(
3233       PerlIO_printf( Perl_debug_log,
3234        "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3235        charid, uvc, (UV)state, PL_colors[5] );
3236      );
3237     }
3238     if (!ST.accepted )
3239     sayNO;
3240
3241     DEBUG_EXECUTE_r(
3242      PerlIO_printf( Perl_debug_log,
3243       "%*s  %sgot %"IVdf" possible matches%s\n",
3244       REPORT_CODE_OFF + depth * 2, "",
3245       PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3246     );
3247    }}
3248    goto trie_first_try; /* jump into the fail handler */
3249    /* NOTREACHED */
3250   case TRIE_next_fail: /* we failed - try next alterative */
3251    if ( ST.jump) {
3252     REGCP_UNWIND(ST.cp);
3253     for (n = *PL_reglastparen; n > ST.lastparen; n--)
3254      PL_regoffs[n].end = -1;
3255     *PL_reglastparen = n;
3256    }
3257   trie_first_try:
3258    if (do_cutgroup) {
3259     do_cutgroup = 0;
3260     no_final = 0;
3261    }
3262
3263    if ( ST.jump) {
3264     ST.lastparen = *PL_reglastparen;
3265     REGCP_SET(ST.cp);
3266    }
3267    if ( ST.accepted == 1 ) {
3268     /* only one choice left - just continue */
3269     DEBUG_EXECUTE_r({
3270      AV *const trie_words
3271       = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3272      SV ** const tmp = av_fetch( trie_words,
3273       ST.accept_buff[ 0 ].wordnum-1, 0 );
3274      SV *sv= tmp ? sv_newmortal() : NULL;
3275
3276      PerlIO_printf( Perl_debug_log,
3277       "%*s  %sonly one match left: #%d <%s>%s\n",
3278       REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3279       ST.accept_buff[ 0 ].wordnum,
3280       tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3281         PL_colors[0], PL_colors[1],
3282         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3283        )
3284       : "not compiled under -Dr",
3285       PL_colors[5] );
3286     });
3287     PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3288     /* in this case we free tmps/leave before we call regmatch
3289     as we wont be using accept_buff again. */
3290
3291     locinput = PL_reginput;
3292     nextchr = UCHARAT(locinput);
3293      if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3294       scan = ST.B;
3295      else
3296       scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3297     if (!has_cutgroup) {
3298      FREETMPS;
3299      LEAVE;
3300     } else {
3301      ST.accepted--;
3302      PUSH_YES_STATE_GOTO(TRIE_next, scan);
3303     }
3304
3305     continue; /* execute rest of RE */
3306    }
3307
3308    if ( !ST.accepted-- ) {
3309     DEBUG_EXECUTE_r({
3310      PerlIO_printf( Perl_debug_log,
3311       "%*s  %sTRIE failed...%s\n",
3312       REPORT_CODE_OFF+depth*2, "",
3313       PL_colors[4],
3314       PL_colors[5] );
3315     });
3316     FREETMPS;
3317     LEAVE;
3318     sayNO_SILENT;
3319     /*NOTREACHED*/
3320    }
3321
3322    /*
3323    There are at least two accepting states left.  Presumably
3324    the number of accepting states is going to be low,
3325    typically two. So we simply scan through to find the one
3326    with lowest wordnum.  Once we find it, we swap the last
3327    state into its place and decrement the size. We then try to
3328    match the rest of the pattern at the point where the word
3329    ends. If we succeed, control just continues along the
3330    regex; if we fail we return here to try the next accepting
3331    state
3332    */
3333
3334    {
3335     U32 best = 0;
3336     U32 cur;
3337     for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3338      DEBUG_TRIE_EXECUTE_r(
3339       PerlIO_printf( Perl_debug_log,
3340        "%*s  %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3341        REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3342        (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3343        ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3344      );
3345
3346      if (ST.accept_buff[cur].wordnum <
3347        ST.accept_buff[best].wordnum)
3348       best = cur;
3349     }
3350
3351     DEBUG_EXECUTE_r({
3352      AV *const trie_words
3353       = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3354      SV ** const tmp = av_fetch( trie_words,
3355       ST.accept_buff[ best ].wordnum - 1, 0 );
3356      regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3357          ST.B :
3358          ST.me + ST.jump[ST.accept_buff[best].wordnum];
3359      SV *sv= tmp ? sv_newmortal() : NULL;
3360
3361      PerlIO_printf( Perl_debug_log,
3362       "%*s  %strying alternation #%d <%s> at node #%d %s\n",
3363       REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3364       ST.accept_buff[best].wordnum,
3365       tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3366         PL_colors[0], PL_colors[1],
3367         (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3368        ) : "not compiled under -Dr",
3369        REG_NODE_NUM(nextop),
3370       PL_colors[5] );
3371     });
3372
3373     if ( best<ST.accepted ) {
3374      reg_trie_accepted tmp = ST.accept_buff[ best ];
3375      ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3376      ST.accept_buff[ ST.accepted ] = tmp;
3377      best = ST.accepted;
3378     }
3379     PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3380     if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3381      scan = ST.B;
3382     } else {
3383      scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3384     }
3385     PUSH_YES_STATE_GOTO(TRIE_next, scan);
3386     /* NOTREACHED */
3387    }
3388    /* NOTREACHED */
3389   case TRIE_next:
3390    /* we dont want to throw this away, see bug 57042*/
3391    if (oreplsv != GvSV(PL_replgv))
3392     sv_setsv(oreplsv, GvSV(PL_replgv));
3393    FREETMPS;
3394    LEAVE;
3395    sayYES;
3396 #undef  ST
3397
3398   case EXACT: {
3399    char *s = STRING(scan);
3400    ln = STR_LEN(scan);
3401    if (do_utf8 != UTF) {
3402     /* The target and the pattern have differing utf8ness. */
3403     char *l = locinput;
3404     const char * const e = s + ln;
3405
3406     if (do_utf8) {
3407      /* The target is utf8, the pattern is not utf8. */
3408      while (s < e) {
3409       STRLEN ulen;
3410       if (l >= PL_regeol)
3411        sayNO;
3412       if (NATIVE_TO_UNI(*(U8*)s) !=
3413        utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3414            uniflags))
3415        sayNO;
3416       l += ulen;
3417       s ++;
3418      }
3419     }
3420     else {
3421      /* The target is not utf8, the pattern is utf8. */
3422      while (s < e) {
3423       STRLEN ulen;
3424       if (l >= PL_regeol)
3425        sayNO;
3426       if (NATIVE_TO_UNI(*((U8*)l)) !=
3427        utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3428           uniflags))
3429        sayNO;
3430       s += ulen;
3431       l ++;
3432      }
3433     }
3434     locinput = l;
3435     nextchr = UCHARAT(locinput);
3436     break;
3437    }
3438    /* The target and the pattern have the same utf8ness. */
3439    /* Inline the first character, for speed. */
3440    if (UCHARAT(s) != nextchr)
3441     sayNO;
3442    if (PL_regeol - locinput < ln)
3443     sayNO;
3444    if (ln > 1 && memNE(s, locinput, ln))
3445     sayNO;
3446    locinput += ln;
3447    nextchr = UCHARAT(locinput);
3448    break;
3449    }
3450   case EXACTFL:
3451    PL_reg_flags |= RF_tainted;
3452    /* FALL THROUGH */
3453   case EXACTF: {
3454    char * const s = STRING(scan);
3455    ln = STR_LEN(scan);
3456
3457    if (do_utf8 || UTF) {
3458    /* Either target or the pattern are utf8. */
3459     const char * const l = locinput;
3460     char *e = PL_regeol;
3461
3462     if (ibcmp_utf8(s, 0,  ln, (bool)UTF,
3463        l, &e, 0,  do_utf8)) {
3464      /* One more case for the sharp s:
3465      * pack("U0U*", 0xDF) =~ /ss/i,
3466      * the 0xC3 0x9F are the UTF-8
3467      * byte sequence for the U+00DF. */
3468
3469      if (!(do_utf8 &&
3470       toLOWER(s[0]) == 's' &&
3471       ln >= 2 &&
3472       toLOWER(s[1]) == 's' &&
3473       (U8)l[0] == 0xC3 &&
3474       e - l >= 2 &&
3475       (U8)l[1] == 0x9F))
3476       sayNO;
3477     }
3478     locinput = e;
3479     nextchr = UCHARAT(locinput);
3480     break;
3481    }
3482
3483    /* Neither the target and the pattern are utf8. */
3484
3485    /* Inline the first character, for speed. */
3486    if (UCHARAT(s) != nextchr &&
3487     UCHARAT(s) != ((OP(scan) == EXACTF)
3488        ? PL_fold : PL_fold_locale)[nextchr])
3489     sayNO;
3490    if (PL_regeol - locinput < ln)
3491     sayNO;
3492    if (ln > 1 && (OP(scan) == EXACTF
3493       ? ibcmp(s, locinput, ln)
3494       : ibcmp_locale(s, locinput, ln)))
3495     sayNO;
3496    locinput += ln;
3497    nextchr = UCHARAT(locinput);
3498    break;
3499    }
3500   case BOUNDL:
3501   case NBOUNDL:
3502    PL_reg_flags |= RF_tainted;
3503    /* FALL THROUGH */
3504   case BOUND:
3505   case NBOUND:
3506    /* was last char in word? */
3507    if (do_utf8) {
3508     if (locinput == PL_bostr)
3509      ln = '\n';
3510     else {
3511      const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3512
3513      ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3514     }
3515     if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3516      ln = isALNUM_uni(ln);
3517      LOAD_UTF8_CHARCLASS_ALNUM();
3518      n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3519     }
3520     else {
3521      ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3522      n = isALNUM_LC_utf8((U8*)locinput);
3523     }
3524    }
3525    else {
3526     ln = (locinput != PL_bostr) ?
3527      UCHARAT(locinput - 1) : '\n';
3528     if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3529      ln = isALNUM(ln);
3530      n = isALNUM(nextchr);
3531     }
3532     else {
3533      ln = isALNUM_LC(ln);
3534      n = isALNUM_LC(nextchr);
3535     }
3536    }
3537    if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3538          OP(scan) == BOUNDL))
3539      sayNO;
3540    break;
3541   case ANYOF:
3542    if (do_utf8) {
3543     STRLEN inclasslen = PL_regeol - locinput;
3544
3545     if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3546      goto anyof_fail;
3547     if (locinput >= PL_regeol)
3548      sayNO;
3549     locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3550     nextchr = UCHARAT(locinput);
3551     break;
3552    }
3553    else {
3554     if (nextchr < 0)
3555      nextchr = UCHARAT(locinput);
3556     if (!REGINCLASS(rex, scan, (U8*)locinput))
3557      goto anyof_fail;
3558     if (!nextchr && locinput >= PL_regeol)
3559      sayNO;
3560     nextchr = UCHARAT(++locinput);
3561     break;
3562    }
3563   anyof_fail:
3564    /* If we might have the case of the German sharp s
3565    * in a casefolding Unicode character class. */
3566
3567    if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3568     locinput += SHARP_S_SKIP;
3569     nextchr = UCHARAT(locinput);
3570    }
3571    else
3572     sayNO;
3573    break;
3574   /* Special char classes - The defines start on line 129 or so */
3575   CCC_TRY_AFF( ALNUM,  ALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3576   CCC_TRY_NEG(NALNUM, NALNUML, perl_word,   "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3577
3578   CCC_TRY_AFF( SPACE,  SPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3579   CCC_TRY_NEG(NSPACE, NSPACEL, perl_space,  " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3580
3581   CCC_TRY_AFF( DIGIT,  DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3582   CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3583
3584   case CLUMP: /* Match \X: logical Unicode character.  This is defined as
3585      a Unicode extended Grapheme Cluster */
3586    /* From http://www.unicode.org/reports/tr29 (5.2 version).  An
3587    extended Grapheme Cluster is:
3588
3589    CR LF
3590    | Prepend* Begin Extend*
3591    | .
3592
3593    Begin is (Hangul-syllable | ! Control)
3594    Extend is (Grapheme_Extend | Spacing_Mark)
3595    Control is [ GCB_Control CR LF ]
3596
3597    The discussion below shows how the code for CLUMP is derived
3598    from this regex.  Note that most of these concepts are from
3599    property values of the Grapheme Cluster Boundary (GCB) property.
3600    No code point can have multiple property values for a given
3601    property.  Thus a code point in Prepend can't be in Control, but
3602    it must be in !Control.  This is why Control above includes
3603    GCB_Control plus CR plus LF.  The latter two are used in the GCB
3604    property separately, and so can't be in GCB_Control, even though
3605    they logically are controls.  Control is not the same as gc=cc,
3606    but includes format and other characters as well.
3607
3608    The Unicode definition of Hangul-syllable is:
3609     L+
3610     | (L* ( ( V | LV ) V* | LVT ) T*)
3611     | T+
3612     )
3613    Each of these is a value for the GCB property, and hence must be
3614    disjoint, so the order they are tested is immaterial, so the
3615    above can safely be changed to
3616     T+
3617     | L+
3618     | (L* ( LVT | ( V | LV ) V*) T*)
3619
3620    The last two terms can be combined like this:
3621     L* ( L
3622       | (( LVT | ( V | LV ) V*) T*))
3623
3624    And refactored into this:
3625     L* (L | LVT T* | V  V* T* | LV  V* T*)
3626
3627    That means that if we have seen any L's at all we can quit
3628    there, but if the next character is a LVT, a V or and LV we
3629    should keep going.
3630
3631    There is a subtlety with Prepend* which showed up in testing.
3632    Note that the Begin, and only the Begin is required in:
3633     | Prepend* Begin Extend*
3634    Also, Begin contains '! Control'.  A Prepend must be a '!
3635    Control', which means it must be a Begin.  What it comes down to
3636    is that if we match Prepend* and then find no suitable Begin
3637    afterwards, that if we backtrack the last Prepend, that one will
3638    be a suitable Begin.
3639    */
3640
3641    if (locinput >= PL_regeol)
3642     sayNO;
3643    if  (! do_utf8) {
3644
3645     /* Match either CR LF  or '.', as all the other possibilities
3646     * require utf8 */
3647     locinput++;     /* Match the . or CR */
3648     if (nextchr == '\r'
3649      && locinput < PL_regeol
3650      && UCHARAT(locinput) == '\n') locinput++;
3651    }
3652    else {
3653
3654     /* Utf8: See if is ( CR LF ); already know that locinput <
3655     * PL_regeol, so locinput+1 is in bounds */
3656     if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3657      locinput += 2;
3658     }
3659     else {
3660      /* In case have to backtrack to beginning, then match '.' */
3661      char *starting = locinput;
3662
3663      /* In case have to backtrack the last prepend */
3664      char *previous_prepend = 0;
3665
3666      LOAD_UTF8_CHARCLASS_GCB();
3667
3668      /* Match (prepend)* */
3669      while (locinput < PL_regeol
3670       && swash_fetch(PL_utf8_X_prepend,
3671           (U8*)locinput, do_utf8))
3672      {
3673       previous_prepend = locinput;
3674       locinput += UTF8SKIP(locinput);
3675      }
3676
3677      /* As noted above, if we matched a prepend character, but
3678      * the next thing won't match, back off the last prepend we
3679      * matched, as it is guaranteed to match the begin */
3680      if (previous_prepend
3681       && (locinput >=  PL_regeol
3682        || ! swash_fetch(PL_utf8_X_begin,
3683            (U8*)locinput, do_utf8)))
3684      {
3685       locinput = previous_prepend;
3686      }
3687
3688      /* Note that here we know PL_regeol > locinput, as we
3689      * tested that upon input to this switch case, and if we
3690      * moved locinput forward, we tested the result just above
3691      * and it either passed, or we backed off so that it will
3692      * now pass */
3693      if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) {
3694
3695       /* Here did not match the required 'Begin' in the
3696       * second term.  So just match the very first
3697       * character, the '.' of the final term of the regex */
3698       locinput = starting + UTF8SKIP(starting);
3699      } else {
3700
3701       /* Here is the beginning of a character that can have
3702       * an extender.  It is either a hangul syllable, or a
3703       * non-control */
3704       if (swash_fetch(PL_utf8_X_non_hangul,
3705           (U8*)locinput, do_utf8))
3706       {
3707
3708        /* Here not a Hangul syllable, must be a
3709        * ('!  * Control') */
3710        locinput += UTF8SKIP(locinput);
3711       } else {
3712
3713        /* Here is a Hangul syllable.  It can be composed
3714        * of several individual characters.  One
3715        * possibility is T+ */
3716        if (swash_fetch(PL_utf8_X_T,
3717            (U8*)locinput, do_utf8))
3718        {
3719         while (locinput < PL_regeol
3720           && swash_fetch(PL_utf8_X_T,
3721               (U8*)locinput, do_utf8))
3722         {
3723          locinput += UTF8SKIP(locinput);
3724         }
3725        } else {
3726
3727         /* Here, not T+, but is a Hangul.  That means
3728         * it is one of the others: L, LV, LVT or V,
3729         * and matches:
3730         * L* (L | LVT T* | V  V* T* | LV  V* T*) */
3731
3732         /* Match L*           */
3733         while (locinput < PL_regeol
3734           && swash_fetch(PL_utf8_X_L,
3735               (U8*)locinput, do_utf8))
3736         {
3737          locinput += UTF8SKIP(locinput);
3738         }
3739
3740         /* Here, have exhausted L*.  If the next
3741         * character is not an LV, LVT nor V, it means
3742         * we had to have at least one L, so matches L+
3743         * in the original equation, we have a complete
3744         * hangul syllable.  Are done. */
3745
3746         if (locinput < PL_regeol
3747          && swash_fetch(PL_utf8_X_LV_LVT_V,
3748              (U8*)locinput, do_utf8))
3749         {
3750
3751          /* Otherwise keep going.  Must be LV, LVT
3752          * or V.  See if LVT */
3753          if (swash_fetch(PL_utf8_X_LVT,
3754              (U8*)locinput, do_utf8))
3755          {
3756           locinput += UTF8SKIP(locinput);
3757          } else {
3758
3759           /* Must be  V or LV.  Take it, then
3760           * match V*     */
3761           locinput += UTF8SKIP(locinput);
3762           while (locinput < PL_regeol
3763             && swash_fetch(PL_utf8_X_V,
3764               (U8*)locinput, do_utf8))
3765           {
3766            locinput += UTF8SKIP(locinput);
3767           }
3768          }
3769
3770          /* And any of LV, LVT, or V can be followed
3771          * by T*            */
3772          while (locinput < PL_regeol
3773           && swash_fetch(PL_utf8_X_T,
3774               (U8*)locinput,
3775               do_utf8))
3776          {
3777           locinput += UTF8SKIP(locinput);
3778          }
3779         }
3780        }
3781       }
3782
3783       /* Match any extender */
3784       while (locinput < PL_regeol
3785         && swash_fetch(PL_utf8_X_extend,
3786             (U8*)locinput, do_utf8))
3787       {
3788        locinput += UTF8SKIP(locinput);
3789       }
3790      }
3791     }
3792     if (locinput > PL_regeol) sayNO;
3793    }
3794    nextchr = UCHARAT(locinput);
3795    break;
3796
3797   case NREFFL:
3798   {
3799    char *s;
3800    char type;
3801    PL_reg_flags |= RF_tainted;
3802    /* FALL THROUGH */
3803   case NREF:
3804   case NREFF:
3805    type = OP(scan);
3806    n = reg_check_named_buff_matched(rex,scan);
3807
3808    if ( n ) {
3809     type = REF + ( type - NREF );
3810     goto do_ref;
3811    } else {
3812     sayNO;
3813    }
3814    /* unreached */
3815   case REFFL:
3816    PL_reg_flags |= RF_tainted;
3817    /* FALL THROUGH */
3818   case REF:
3819   case REFF:
3820    n = ARG(scan);  /* which paren pair */
3821    type = OP(scan);
3822   do_ref:
3823    ln = PL_regoffs[n].start;
3824    PL_reg_leftiter = PL_reg_maxiter;  /* Void cache */
3825    if (*PL_reglastparen < n || ln == -1)
3826     sayNO;   /* Do not match unless seen CLOSEn. */
3827    if (ln == PL_regoffs[n].end)
3828     break;
3829
3830    s = PL_bostr + ln;
3831    if (do_utf8 && type != REF) { /* REF can do byte comparison */
3832     char *l = locinput;
3833     const char *e = PL_bostr + PL_regoffs[n].end;
3834     /*
3835     * Note that we can't do the "other character" lookup trick as
3836     * in the 8-bit case (no pun intended) because in Unicode we
3837     * have to map both upper and title case to lower case.
3838     */
3839     if (type == REFF) {
3840      while (s < e) {
3841       STRLEN ulen1, ulen2;
3842       U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3843       U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3844
3845       if (l >= PL_regeol)
3846        sayNO;
3847       toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3848       toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3849       if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3850        sayNO;
3851       s += ulen1;
3852       l += ulen2;
3853      }
3854     }
3855     locinput = l;
3856     nextchr = UCHARAT(locinput);
3857     break;
3858    }
3859
3860    /* Inline the first character, for speed. */
3861    if (UCHARAT(s) != nextchr &&
3862     (type == REF ||
3863     (UCHARAT(s) != (type == REFF
3864         ? PL_fold : PL_fold_locale)[nextchr])))
3865     sayNO;
3866    ln = PL_regoffs[n].end - ln;
3867    if (locinput + ln > PL_regeol)
3868     sayNO;
3869    if (ln > 1 && (type == REF
3870       ? memNE(s, locinput, ln)
3871       : (type == REFF
3872        ? ibcmp(s, locinput, ln)
3873        : ibcmp_locale(s, locinput, ln))))
3874     sayNO;
3875    locinput += ln;
3876    nextchr = UCHARAT(locinput);
3877    break;
3878   }
3879   case NOTHING:
3880   case TAIL:
3881    break;
3882   case BACK:
3883    break;
3884
3885 #undef  ST
3886 #define ST st->u.eval
3887   {
3888    SV *ret;
3889    REGEXP *re_sv;
3890    regexp *re;
3891    regexp_internal *rei;
3892    regnode *startpoint;
3893
3894   case GOSTART:
3895   case GOSUB: /*    /(...(?1))/   /(...(?&foo))/   */
3896    if (cur_eval && cur_eval->locinput==locinput) {
3897     if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3898      Perl_croak(aTHX_ "Infinite recursion in regex");
3899     if ( ++nochange_depth > max_nochange_depth )
3900      Perl_croak(aTHX_
3901       "Pattern subroutine nesting without pos change"
3902       " exceeded limit in regex");
3903    } else {
3904     nochange_depth = 0;
3905    }
3906    re_sv = rex_sv;
3907    re = rex;
3908    rei = rexi;
3909    (void)ReREFCNT_inc(rex_sv);
3910    if (OP(scan)==GOSUB) {
3911     startpoint = scan + ARG2L(scan);
3912     ST.close_paren = ARG(scan);
3913    } else {
3914     startpoint = rei->program+1;
3915     ST.close_paren = 0;
3916    }
3917    goto eval_recurse_doit;
3918    /* NOTREACHED */
3919   case EVAL:  /*   /(?{A})B/   /(??{A})B/  and /(?(?{A})X|Y)B/   */
3920    if (cur_eval && cur_eval->locinput==locinput) {
3921     if ( ++nochange_depth > max_nochange_depth )
3922      Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3923    } else {
3924     nochange_depth = 0;
3925    }
3926    {
3927     /* execute the code in the {...} */
3928     dSP;
3929     SV ** const before = SP;
3930     OP_4tree * const oop = PL_op;
3931     COP * const ocurcop = PL_curcop;
3932     PAD *old_comppad;
3933     char *saved_regeol = PL_regeol;
3934
3935     n = ARG(scan);
3936     PL_op = (OP_4tree*)rexi->data->data[n];
3937     DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3938      "  re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3939     PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3940     PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3941
3942     if (sv_yes_mark) {
3943      SV *sv_mrk = get_sv("REGMARK", 1);
3944      sv_setsv(sv_mrk, sv_yes_mark);
3945     }
3946
3947     CALLRUNOPS(aTHX);   /* Scalar context. */
3948     SPAGAIN;
3949     if (SP == before)
3950      ret = &PL_sv_undef;   /* protect against empty (?{}) blocks. */
3951     else {
3952      ret = POPs;
3953      PUTBACK;
3954     }
3955
3956     PL_op = oop;
3957     PAD_RESTORE_LOCAL(old_comppad);
3958     PL_curcop = ocurcop;
3959     PL_regeol = saved_regeol;
3960     if (!logical) {
3961      /* /(?{...})/ */
3962      sv_setsv(save_scalar(PL_replgv), ret);
3963      break;
3964     }
3965    }
3966    if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3967     logical = 0;
3968     {
3969      /* extract RE object from returned value; compiling if
3970      * necessary */
3971      MAGIC *mg = NULL;
3972      REGEXP *rx = NULL;
3973
3974      if (SvROK(ret)) {
3975       SV *const sv = SvRV(ret);
3976
3977       if (SvTYPE(sv) == SVt_REGEXP) {
3978        rx = (REGEXP*) sv;
3979       } else if (SvSMAGICAL(sv)) {
3980        mg = mg_find(sv, PERL_MAGIC_qr);
3981        assert(mg);
3982       }
3983      } else if (SvTYPE(ret) == SVt_REGEXP) {
3984       rx = (REGEXP*) ret;
3985      } else if (SvSMAGICAL(ret)) {
3986       if (SvGMAGICAL(ret)) {
3987        /* I don't believe that there is ever qr magic
3988        here.  */
3989        assert(!mg_find(ret, PERL_MAGIC_qr));
3990        sv_unmagic(ret, PERL_MAGIC_qr);
3991       }
3992       else {
3993        mg = mg_find(ret, PERL_MAGIC_qr);
3994        /* testing suggests mg only ends up non-NULL for
3995        scalars who were upgraded and compiled in the
3996        else block below. In turn, this is only
3997        triggered in the "postponed utf8 string" tests
3998        in t/op/pat.t  */
3999       }
4000      }
4001
4002      if (mg) {
4003       rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4004       assert(rx);
4005      }
4006      if (rx) {
4007       rx = reg_temp_copy(NULL, rx);
4008      }
4009      else {
4010       U32 pm_flags = 0;
4011       const I32 osize = PL_regsize;
4012
4013       if (DO_UTF8(ret)) {
4014        assert (SvUTF8(ret));
4015       } else if (SvUTF8(ret)) {
4016        /* Not doing UTF-8, despite what the SV says. Is
4017        this only if we're trapped in use 'bytes'?  */
4018        /* Make a copy of the octet sequence, but without
4019        the flag on, as the compiler now honours the
4020        SvUTF8 flag on ret.  */
4021        STRLEN len;
4022        const char *const p = SvPV(ret, len);
4023        ret = newSVpvn_flags(p, len, SVs_TEMP);
4024       }
4025       rx = CALLREGCOMP(ret, pm_flags);
4026       if (!(SvFLAGS(ret)
4027        & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4028         | SVs_GMG))) {
4029        /* This isn't a first class regexp. Instead, it's
4030        caching a regexp onto an existing, Perl visible
4031        scalar.  */
4032        sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4033       }
4034       PL_regsize = osize;
4035      }
4036      re_sv = rx;
4037      re = (struct regexp *)SvANY(rx);
4038     }
4039     RXp_MATCH_COPIED_off(re);
4040     re->subbeg = rex->subbeg;
4041     re->sublen = rex->sublen;
4042     rei = RXi_GET(re);
4043     DEBUG_EXECUTE_r(
4044      debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
4045       "Matching embedded");
4046     );
4047     startpoint = rei->program + 1;
4048      ST.close_paren = 0; /* only used for GOSUB */
4049      /* borrowed from regtry */
4050     if (PL_reg_start_tmpl <= re->nparens) {
4051      PL_reg_start_tmpl = re->nparens*3/2 + 3;
4052      if(PL_reg_start_tmp)
4053       Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4054      else
4055       Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4056     }
4057
4058   eval_recurse_doit: /* Share code with GOSUB below this line */
4059     /* run the pattern returned from (??{...}) */
4060     ST.cp = regcppush(0); /* Save *all* the positions. */
4061     REGCP_SET(ST.lastcp);
4062
4063     PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4064
4065     /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4066     PL_reglastparen = &re->lastparen;
4067     PL_reglastcloseparen = &re->lastcloseparen;
4068     re->lastparen = 0;
4069     re->lastcloseparen = 0;
4070
4071     PL_reginput = locinput;
4072     PL_regsize = 0;
4073
4074     /* XXXX This is too dramatic a measure... */
4075     PL_reg_maxiter = 0;
4076
4077     ST.toggle_reg_flags = PL_reg_flags;
4078     if (RX_UTF8(re_sv))
4079      PL_reg_flags |= RF_utf8;
4080     else
4081      PL_reg_flags &= ~RF_utf8;
4082     ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4083
4084     ST.prev_rex = rex_sv;
4085     ST.prev_curlyx = cur_curlyx;
4086     SETREX(rex_sv,re_sv);
4087     rex = re;
4088     rexi = rei;
4089     cur_curlyx = NULL;
4090     ST.B = next;
4091     ST.prev_eval = cur_eval;
4092     cur_eval = st;
4093     /* now continue from first node in postoned RE */
4094     PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4095     /* NOTREACHED */
4096    }
4097    /* logical is 1,   /(?(?{...})X|Y)/ */
4098    sw = (bool)SvTRUE(ret);
4099    logical = 0;
4100    break;
4101   }
4102
4103   case EVAL_AB: /* cleanup after a successful (??{A})B */
4104    /* note: this is called twice; first after popping B, then A */
4105    PL_reg_flags ^= ST.toggle_reg_flags;
4106    ReREFCNT_dec(rex_sv);
4107    SETREX(rex_sv,ST.prev_rex);
4108    rex = (struct regexp *)SvANY(rex_sv);
4109    rexi = RXi_GET(rex);
4110    regcpblow(ST.cp);
4111    cur_eval = ST.prev_eval;
4112    cur_curlyx = ST.prev_curlyx;
4113
4114    /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4115    PL_reglastparen = &rex->lastparen;
4116    PL_reglastcloseparen = &rex->lastcloseparen;
4117    /* also update PL_regoffs */
4118    PL_regoffs = rex->offs;
4119
4120    /* XXXX This is too dramatic a measure... */
4121    PL_reg_maxiter = 0;
4122    if ( nochange_depth )
4123     nochange_depth--;
4124    sayYES;
4125
4126
4127   case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4128    /* note: this is called twice; first after popping B, then A */
4129    PL_reg_flags ^= ST.toggle_reg_flags;
4130    ReREFCNT_dec(rex_sv);
4131    SETREX(rex_sv,ST.prev_rex);
4132    rex = (struct regexp *)SvANY(rex_sv);
4133    rexi = RXi_GET(rex);
4134    /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4135    PL_reglastparen = &rex->lastparen;
4136    PL_reglastcloseparen = &rex->lastcloseparen;
4137
4138    PL_reginput = locinput;
4139    REGCP_UNWIND(ST.lastcp);
4140    regcppop(rex);
4141    cur_eval = ST.prev_eval;
4142    cur_curlyx = ST.prev_curlyx;
4143    /* XXXX This is too dramatic a measure... */
4144    PL_reg_maxiter = 0;
4145    if ( nochange_depth )
4146     nochange_depth--;
4147    sayNO_SILENT;
4148 #undef ST
4149
4150   case OPEN:
4151    n = ARG(scan);  /* which paren pair */
4152    PL_reg_start_tmp[n] = locinput;
4153    if (n > PL_regsize)
4154     PL_regsize = n;
4155    lastopen = n;
4156    break;
4157   case CLOSE:
4158    n = ARG(scan);  /* which paren pair */
4159    PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4160    PL_regoffs[n].end = locinput - PL_bostr;
4161    /*if (n > PL_regsize)
4162     PL_regsize = n;*/
4163    if (n > *PL_reglastparen)
4164     *PL_reglastparen = n;
4165    *PL_reglastcloseparen = n;
4166    if (cur_eval && cur_eval->u.eval.close_paren == n) {
4167     goto fake_end;
4168    }
4169    break;
4170   case ACCEPT:
4171    if (ARG(scan)){
4172     regnode *cursor;
4173     for (cursor=scan;
4174      cursor && OP(cursor)!=END;
4175      cursor=regnext(cursor))
4176     {
4177      if ( OP(cursor)==CLOSE ){
4178       n = ARG(cursor);
4179       if ( n <= lastopen ) {
4180        PL_regoffs[n].start
4181         = PL_reg_start_tmp[n] - PL_bostr;
4182        PL_regoffs[n].end = locinput - PL_bostr;
4183        /*if (n > PL_regsize)
4184        PL_regsize = n;*/
4185        if (n > *PL_reglastparen)
4186         *PL_reglastparen = n;
4187        *PL_reglastcloseparen = n;
4188        if ( n == ARG(scan) || (cur_eval &&
4189         cur_eval->u.eval.close_paren == n))
4190         break;
4191       }
4192      }
4193     }
4194    }
4195    goto fake_end;
4196    /*NOTREACHED*/
4197   case GROUPP:
4198    n = ARG(scan);  /* which paren pair */
4199    sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4200    break;
4201   case NGROUPP:
4202    /* reg_check_named_buff_matched returns 0 for no match */
4203    sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
4204    break;
4205   case INSUBP:
4206    n = ARG(scan);
4207    sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4208    break;
4209   case DEFINEP:
4210    sw = 0;
4211    break;
4212   case IFTHEN:
4213    PL_reg_leftiter = PL_reg_maxiter;  /* Void cache */
4214    if (sw)
4215     next = NEXTOPER(NEXTOPER(scan));
4216    else {
4217     next = scan + ARG(scan);
4218     if (OP(next) == IFTHEN) /* Fake one. */
4219      next = NEXTOPER(NEXTOPER(next));
4220    }
4221    break;
4222   case LOGICAL:
4223    logical = scan->flags;
4224    break;
4225
4226 /*******************************************************************
4227
4228 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4229 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4230 STAR/PLUS/CURLY/CURLYN are used instead.)
4231
4232 A*B is compiled as <CURLYX><A><WHILEM><B>
4233
4234 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4235 state, which contains the current count, initialised to -1. It also sets
4236 cur_curlyx to point to this state, with any previous value saved in the
4237 state block.
4238
4239 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4240 since the pattern may possibly match zero times (i.e. it's a while {} loop
4241 rather than a do {} while loop).
4242
4243 Each entry to WHILEM represents a successful match of A. The count in the
4244 CURLYX block is incremented, another WHILEM state is pushed, and execution
4245 passes to A or B depending on greediness and the current count.
4246
4247 For example, if matching against the string a1a2a3b (where the aN are
4248 substrings that match /A/), then the match progresses as follows: (the
4249 pushed states are interspersed with the bits of strings matched so far):
4250
4251  <CURLYX cnt=-1>
4252  <CURLYX cnt=0><WHILEM>
4253  <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4254  <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4255  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4256  <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4257
4258 (Contrast this with something like CURLYM, which maintains only a single
4259 backtrack state:
4260
4261  <CURLYM cnt=0> a1
4262  a1 <CURLYM cnt=1> a2
4263  a1 a2 <CURLYM cnt=2> a3
4264  a1 a2 a3 <CURLYM cnt=3> b
4265 )
4266
4267 Each WHILEM state block marks a point to backtrack to upon partial failure
4268 of A or B, and also contains some minor state data related to that
4269 iteration.  The CURLYX block, pointed to by cur_curlyx, contains the
4270 overall state, such as the count, and pointers to the A and B ops.
4271
4272 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4273 must always point to the *current* CURLYX block, the rules are:
4274
4275 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4276 and set cur_curlyx to point the new block.
4277
4278 When popping the CURLYX block after a successful or unsuccessful match,
4279 restore the previous cur_curlyx.
4280
4281 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4282 to the outer one saved in the CURLYX block.
4283
4284 When popping the WHILEM block after a successful or unsuccessful B match,
4285 restore the previous cur_curlyx.
4286
4287 Here's an example for the pattern (AI* BI)*BO
4288 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4289
4290 cur_
4291 curlyx backtrack stack
4292 ------ ---------------
4293 NULL
4294 CO     <CO prev=NULL> <WO>
4295 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4296 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4297 NULL   <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4298
4299 At this point the pattern succeeds, and we work back down the stack to
4300 clean up, restoring as we go:
4301
4302 CO     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4303 CI     <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4304 CO     <CO prev=NULL> <WO>
4305 NULL
4306
4307 *******************************************************************/
4308
4309 #define ST st->u.curlyx
4310
4311   case CURLYX:    /* start of /A*B/  (for complex A) */
4312   {
4313    /* No need to save/restore up to this paren */
4314    I32 parenfloor = scan->flags;
4315
4316    assert(next); /* keep Coverity happy */
4317    if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4318     next += ARG(next);
4319
4320    /* XXXX Probably it is better to teach regpush to support
4321    parenfloor > PL_regsize... */
4322    if (parenfloor > (I32)*PL_reglastparen)
4323     parenfloor = *PL_reglastparen; /* Pessimization... */
4324
4325    ST.prev_curlyx= cur_curlyx;
4326    cur_curlyx = st;
4327    ST.cp = PL_savestack_ix;
4328
4329    /* these fields contain the state of the current curly.
4330    * they are accessed by subsequent WHILEMs */
4331    ST.parenfloor = parenfloor;
4332    ST.min = ARG1(scan);
4333    ST.max = ARG2(scan);
4334    ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4335    ST.B = next;
4336    ST.minmod = minmod;
4337    minmod = 0;
4338    ST.count = -1; /* this will be updated by WHILEM */
4339    ST.lastloc = NULL;  /* this will be updated by WHILEM */
4340
4341    PL_reginput = locinput;
4342    PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4343    /* NOTREACHED */
4344   }
4345
4346   case CURLYX_end: /* just finished matching all of A*B */
4347    cur_curlyx = ST.prev_curlyx;
4348    sayYES;
4349    /* NOTREACHED */
4350
4351   case CURLYX_end_fail: /* just failed to match all of A*B */
4352    regcpblow(ST.cp);
4353    cur_curlyx = ST.prev_curlyx;
4354    sayNO;
4355    /* NOTREACHED */
4356
4357
4358 #undef ST
4359 #define ST st->u.whilem
4360
4361   case WHILEM:     /* just matched an A in /A*B/  (for complex A) */
4362   {
4363    /* see the discussion above about CURLYX/WHILEM */
4364    I32 n;
4365    assert(cur_curlyx); /* keep Coverity happy */
4366    n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4367    ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4368    ST.cache_offset = 0;
4369    ST.cache_mask = 0;
4370
4371    PL_reginput = locinput;
4372
4373    DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4374     "%*s  whilem: matched %ld out of %ld..%ld\n",
4375     REPORT_CODE_OFF+depth*2, "", (long)n,
4376     (long)cur_curlyx->u.curlyx.min,
4377     (long)cur_curlyx->u.curlyx.max)
4378    );
4379
4380    /* First just match a string of min A's. */
4381
4382    if (n < cur_curlyx->u.curlyx.min) {
4383     cur_curlyx->u.curlyx.lastloc = locinput;
4384     PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4385     /* NOTREACHED */
4386    }
4387
4388    /* If degenerate A matches "", assume A done. */
4389
4390    if (locinput == cur_curlyx->u.curlyx.lastloc) {
4391     DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4392     "%*s  whilem: empty match detected, trying continuation...\n",
4393     REPORT_CODE_OFF+depth*2, "")
4394     );
4395     goto do_whilem_B_max;
4396    }
4397
4398    /* super-linear cache processing */
4399
4400    if (scan->flags) {
4401
4402     if (!PL_reg_maxiter) {
4403      /* start the countdown: Postpone detection until we
4404      * know the match is not *that* much linear. */
4405      PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4406      /* possible overflow for long strings and many CURLYX's */
4407      if (PL_reg_maxiter < 0)
4408       PL_reg_maxiter = I32_MAX;
4409      PL_reg_leftiter = PL_reg_maxiter;
4410     }
4411
4412     if (PL_reg_leftiter-- == 0) {
4413      /* initialise cache */
4414      const I32 size = (PL_reg_maxiter + 7)/8;
4415      if (PL_reg_poscache) {
4416       if ((I32)PL_reg_poscache_size < size) {
4417        Renew(PL_reg_poscache, size, char);
4418        PL_reg_poscache_size = size;
4419       }
4420       Zero(PL_reg_poscache, size, char);
4421      }
4422      else {
4423       PL_reg_poscache_size = size;
4424       Newxz(PL_reg_poscache, size, char);
4425      }
4426      DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4427  "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4428        PL_colors[4], PL_colors[5])
4429      );
4430     }
4431
4432     if (PL_reg_leftiter < 0) {
4433      /* have we already failed at this position? */
4434      I32 offset, mask;
4435      offset  = (scan->flags & 0xf) - 1
4436         + (locinput - PL_bostr)  * (scan->flags>>4);
4437      mask    = 1 << (offset % 8);
4438      offset /= 8;
4439      if (PL_reg_poscache[offset] & mask) {
4440       DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4441        "%*s  whilem: (cache) already tried at this position...\n",
4442        REPORT_CODE_OFF+depth*2, "")
4443       );
4444       sayNO; /* cache records failure */
4445      }
4446      ST.cache_offset = offset;
4447      ST.cache_mask   = mask;
4448     }
4449    }
4450
4451    /* Prefer B over A for minimal matching. */
4452
4453    if (cur_curlyx->u.curlyx.minmod) {
4454     ST.save_curlyx = cur_curlyx;
4455     cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4456     ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4457     REGCP_SET(ST.lastcp);
4458     PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4459     /* NOTREACHED */
4460    }
4461
4462    /* Prefer A over B for maximal matching. */
4463
4464    if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4465     ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4466     cur_curlyx->u.curlyx.lastloc = locinput;
4467     REGCP_SET(ST.lastcp);
4468     PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4469     /* NOTREACHED */
4470    }
4471    goto do_whilem_B_max;
4472   }
4473   /* NOTREACHED */
4474
4475   case WHILEM_B_min: /* just matched B in a minimal match */
4476   case WHILEM_B_max: /* just matched B in a maximal match */
4477    cur_curlyx = ST.save_curlyx;
4478    sayYES;
4479    /* NOTREACHED */
4480
4481   case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4482    cur_curlyx = ST.save_curlyx;
4483    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4484    cur_curlyx->u.curlyx.count--;
4485    CACHEsayNO;
4486    /* NOTREACHED */
4487
4488   case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4489    REGCP_UNWIND(ST.lastcp);
4490    regcppop(rex);
4491    /* FALL THROUGH */
4492   case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4493    cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4494    cur_curlyx->u.curlyx.count--;
4495    CACHEsayNO;
4496    /* NOTREACHED */
4497
4498   case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4499    REGCP_UNWIND(ST.lastcp);
4500    regcppop(rex); /* Restore some previous $<digit>s? */
4501    PL_reginput = locinput;
4502    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4503     "%*s  whilem: failed, trying continuation...\n",
4504     REPORT_CODE_OFF+depth*2, "")
4505    );
4506   do_whilem_B_max:
4507    if (cur_curlyx->u.curlyx.count >= REG_INFTY
4508     && ckWARN(WARN_REGEXP)
4509     && !(PL_reg_flags & RF_warned))
4510    {
4511     PL_reg_flags |= RF_warned;
4512     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4513      "Complex regular subexpression recursion",
4514      REG_INFTY - 1);
4515    }
4516
4517    /* now try B */
4518    ST.save_curlyx = cur_curlyx;
4519    cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4520    PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4521    /* NOTREACHED */
4522
4523   case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4524    cur_curlyx = ST.save_curlyx;
4525    REGCP_UNWIND(ST.lastcp);
4526    regcppop(rex);
4527
4528    if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4529     /* Maximum greed exceeded */
4530     if (cur_curlyx->u.curlyx.count >= REG_INFTY
4531      && ckWARN(WARN_REGEXP)
4532      && !(PL_reg_flags & RF_warned))
4533     {
4534      PL_reg_flags |= RF_warned;
4535      Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4536       "%s limit (%d) exceeded",
4537       "Complex regular subexpression recursion",
4538       REG_INFTY - 1);
4539     }
4540     cur_curlyx->u.curlyx.count--;
4541     CACHEsayNO;
4542    }
4543
4544    DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4545     "%*s  trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4546    );
4547    /* Try grabbing another A and see if it helps. */
4548    PL_reginput = locinput;
4549    cur_curlyx->u.curlyx.lastloc = locinput;
4550    ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4551    REGCP_SET(ST.lastcp);
4552    PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4553    /* NOTREACHED */
4554
4555 #undef  ST
4556 #define ST st->u.branch
4557
4558   case BRANCHJ:     /*  /(...|A|...)/ with long next pointer */
4559    next = scan + ARG(scan);
4560    if (next == scan)
4561     next = NULL;
4562    scan = NEXTOPER(scan);
4563    /* FALL THROUGH */
4564
4565   case BRANCH:     /*  /(...|A|...)/ */
4566    scan = NEXTOPER(scan); /* scan now points to inner node */
4567    ST.lastparen = *PL_reglastparen;
4568    ST.next_branch = next;
4569    REGCP_SET(ST.cp);
4570    PL_reginput = locinput;
4571
4572    /* Now go into the branch */
4573    if (has_cutgroup) {
4574     PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4575    } else {
4576     PUSH_STATE_GOTO(BRANCH_next, scan);
4577    }
4578    /* NOTREACHED */
4579   case CUTGROUP:
4580    PL_reginput = locinput;
4581    sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4582     MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4583    PUSH_STATE_GOTO(CUTGROUP_next,next);
4584    /* NOTREACHED */
4585   case CUTGROUP_next_fail:
4586    do_cutgroup = 1;
4587    no_final = 1;
4588    if (st->u.mark.mark_name)
4589     sv_commit = st->u.mark.mark_name;
4590    sayNO;
4591    /* NOTREACHED */
4592   case BRANCH_next:
4593    sayYES;
4594    /* NOTREACHED */
4595   case BRANCH_next_fail: /* that branch failed; try the next, if any */
4596    if (do_cutgroup) {
4597     do_cutgroup = 0;
4598     no_final = 0;
4599    }
4600    REGCP_UNWIND(ST.cp);
4601    for (n = *PL_reglastparen; n > ST.lastparen; n--)
4602     PL_regoffs[n].end = -1;
4603    *PL_reglastparen = n;
4604    /*dmq: *PL_reglastcloseparen = n; */
4605    scan = ST.next_branch;
4606    /* no more branches? */
4607    if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4608     DEBUG_EXECUTE_r({
4609      PerlIO_printf( Perl_debug_log,
4610       "%*s  %sBRANCH failed...%s\n",
4611       REPORT_CODE_OFF+depth*2, "",
4612       PL_colors[4],
4613       PL_colors[5] );
4614     });
4615     sayNO_SILENT;
4616    }
4617    continue; /* execute next BRANCH[J] op */
4618    /* NOTREACHED */
4619
4620   case MINMOD:
4621    minmod = 1;
4622    break;
4623
4624 #undef  ST
4625 #define ST st->u.curlym
4626
4627   case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4628
4629    /* This is an optimisation of CURLYX that enables us to push
4630    * only a single backtracking state, no matter how many matches
4631    * there are in {m,n}. It relies on the pattern being constant
4632    * length, with no parens to influence future backrefs
4633    */
4634
4635    ST.me = scan;
4636    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4637
4638    /* if paren positive, emulate an OPEN/CLOSE around A */
4639    if (ST.me->flags) {
4640     U32 paren = ST.me->flags;
4641     if (paren > PL_regsize)
4642      PL_regsize = paren;
4643     if (paren > *PL_reglastparen)
4644      *PL_reglastparen = paren;
4645     scan += NEXT_OFF(scan); /* Skip former OPEN. */
4646    }
4647    ST.A = scan;
4648    ST.B = next;
4649    ST.alen = 0;
4650    ST.count = 0;
4651    ST.minmod = minmod;
4652    minmod = 0;
4653    ST.c1 = CHRTEST_UNINIT;
4654    REGCP_SET(ST.cp);
4655
4656    if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4657     goto curlym_do_B;
4658
4659   curlym_do_A: /* execute the A in /A{m,n}B/  */
4660    PL_reginput = locinput;
4661    PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4662    /* NOTREACHED */
4663
4664   case CURLYM_A: /* we've just matched an A */
4665    locinput = st->locinput;
4666    nextchr = UCHARAT(locinput);
4667
4668    ST.count++;
4669    /* after first match, determine A's length: u.curlym.alen */
4670    if (ST.count == 1) {
4671     if (PL_reg_match_utf8) {
4672      char *s = locinput;
4673      while (s < PL_reginput) {
4674       ST.alen++;
4675       s += UTF8SKIP(s);
4676      }
4677     }
4678     else {
4679      ST.alen = PL_reginput - locinput;
4680     }
4681     if (ST.alen == 0)
4682      ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4683    }
4684    DEBUG_EXECUTE_r(
4685     PerlIO_printf(Perl_debug_log,
4686       "%*s  CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4687       (int)(REPORT_CODE_OFF+(depth*2)), "",
4688       (IV) ST.count, (IV)ST.alen)
4689    );
4690
4691    locinput = PL_reginput;
4692
4693    if (cur_eval && cur_eval->u.eval.close_paren &&
4694     cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4695     goto fake_end;
4696
4697    {
4698     I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4699     if ( max == REG_INFTY || ST.count < max )
4700      goto curlym_do_A; /* try to match another A */
4701    }
4702    goto curlym_do_B; /* try to match B */
4703
4704   case CURLYM_A_fail: /* just failed to match an A */
4705    REGCP_UNWIND(ST.cp);
4706
4707    if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4708     || (cur_eval && cur_eval->u.eval.close_paren &&
4709      cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4710     sayNO;
4711
4712   curlym_do_B: /* execute the B in /A{m,n}B/  */
4713    PL_reginput = locinput;
4714    if (ST.c1 == CHRTEST_UNINIT) {
4715     /* calculate c1 and c2 for possible match of 1st char
4716     * following curly */
4717     ST.c1 = ST.c2 = CHRTEST_VOID;
4718     if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4719      regnode *text_node = ST.B;
4720      if (! HAS_TEXT(text_node))
4721       FIND_NEXT_IMPT(text_node);
4722      /* this used to be
4723
4724       (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4725
4726        But the former is redundant in light of the latter.
4727
4728        if this changes back then the macro for
4729        IS_TEXT and friends need to change.
4730      */
4731      if (PL_regkind[OP(text_node)] == EXACT)
4732      {
4733
4734       ST.c1 = (U8)*STRING(text_node);
4735       ST.c2 =
4736        (IS_TEXTF(text_node))
4737        ? PL_fold[ST.c1]
4738        : (IS_TEXTFL(text_node))
4739         ? PL_fold_locale[ST.c1]
4740         : ST.c1;
4741      }
4742     }
4743    }
4744
4745    DEBUG_EXECUTE_r(
4746     PerlIO_printf(Perl_debug_log,
4747      "%*s  CURLYM trying tail with matches=%"IVdf"...\n",
4748      (int)(REPORT_CODE_OFF+(depth*2)),
4749      "", (IV)ST.count)
4750     );
4751    if (ST.c1 != CHRTEST_VOID
4752      && UCHARAT(PL_reginput) != ST.c1
4753      && UCHARAT(PL_reginput) != ST.c2)
4754    {
4755     /* simulate B failing */
4756     DEBUG_OPTIMISE_r(
4757      PerlIO_printf(Perl_debug_log,
4758       "%*s  CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4759       (int)(REPORT_CODE_OFF+(depth*2)),"",
4760       (IV)ST.c1,(IV)ST.c2
4761     ));
4762     state_num = CURLYM_B_fail;
4763     goto reenter_switch;
4764    }
4765
4766    if (ST.me->flags) {
4767     /* mark current A as captured */
4768     I32 paren = ST.me->flags;
4769     if (ST.count) {
4770      PL_regoffs[paren].start
4771       = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4772      PL_regoffs[paren].end = PL_reginput - PL_bostr;
4773      /*dmq: *PL_reglastcloseparen = paren; */
4774     }
4775     else
4776      PL_regoffs[paren].end = -1;
4777     if (cur_eval && cur_eval->u.eval.close_paren &&
4778      cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4779     {
4780      if (ST.count)
4781       goto fake_end;
4782      else
4783       sayNO;
4784     }
4785    }
4786
4787    PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4788    /* NOTREACHED */
4789
4790   case CURLYM_B_fail: /* just failed to match a B */
4791    REGCP_UNWIND(ST.cp);
4792    if (ST.minmod) {
4793     I32 max = ARG2(ST.me);
4794     if (max != REG_INFTY && ST.count == max)
4795      sayNO;
4796     goto curlym_do_A; /* try to match a further A */
4797    }
4798    /* backtrack one A */
4799    if (ST.count == ARG1(ST.me) /* min */)
4800     sayNO;
4801    ST.count--;
4802    locinput = HOPc(locinput, -ST.alen);
4803    goto curlym_do_B; /* try to match B */
4804
4805 #undef ST
4806 #define ST st->u.curly
4807
4808 #define CURLY_SETPAREN(paren, success) \
4809  if (paren) { \
4810   if (success) { \
4811    PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4812    PL_regoffs[paren].end = locinput - PL_bostr; \
4813    *PL_reglastcloseparen = paren; \
4814   } \
4815   else \
4816    PL_regoffs[paren].end = -1; \
4817  }
4818
4819   case STAR:  /*  /A*B/ where A is width 1 */
4820    ST.paren = 0;
4821    ST.min = 0;
4822    ST.max = REG_INFTY;
4823    scan = NEXTOPER(scan);
4824    goto repeat;
4825   case PLUS:  /*  /A+B/ where A is width 1 */
4826    ST.paren = 0;
4827    ST.min = 1;
4828    ST.max = REG_INFTY;
4829    scan = NEXTOPER(scan);
4830    goto repeat;
4831   case CURLYN:  /*  /(A){m,n}B/ where A is width 1 */
4832    ST.paren = scan->flags; /* Which paren to set */
4833    if (ST.paren > PL_regsize)
4834     PL_regsize = ST.paren;
4835    if (ST.paren > *PL_reglastparen)
4836     *PL_reglastparen = ST.paren;
4837    ST.min = ARG1(scan);  /* min to match */
4838    ST.max = ARG2(scan);  /* max to match */
4839    if (cur_eval && cur_eval->u.eval.close_paren &&
4840     cur_eval->u.eval.close_paren == (U32)ST.paren) {
4841     ST.min=1;
4842     ST.max=1;
4843    }
4844    scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4845    goto repeat;
4846   case CURLY:  /*  /A{m,n}B/ where A is width 1 */
4847    ST.paren = 0;
4848    ST.min = ARG1(scan);  /* min to match */
4849    ST.max = ARG2(scan);  /* max to match */
4850    scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4851   repeat:
4852    /*
4853    * Lookahead to avoid useless match attempts
4854    * when we know what character comes next.
4855    *
4856    * Used to only do .*x and .*?x, but now it allows
4857    * for )'s, ('s and (?{ ... })'s to be in the way
4858    * of the quantifier and the EXACT-like node.  -- japhy
4859    */
4860
4861    if (ST.min > ST.max) /* XXX make this a compile-time check? */
4862     sayNO;
4863    if (HAS_TEXT(next) || JUMPABLE(next)) {
4864     U8 *s;
4865     regnode *text_node = next;
4866
4867     if (! HAS_TEXT(text_node))
4868      FIND_NEXT_IMPT(text_node);
4869
4870     if (! HAS_TEXT(text_node))
4871      ST.c1 = ST.c2 = CHRTEST_VOID;
4872     else {
4873      if ( PL_regkind[OP(text_node)] != EXACT ) {
4874       ST.c1 = ST.c2 = CHRTEST_VOID;
4875       goto assume_ok_easy;
4876      }
4877      else
4878       s = (U8*)STRING(text_node);
4879
4880      /*  Currently we only get here when
4881
4882       PL_rekind[OP(text_node)] == EXACT
4883
4884       if this changes back then the macro for IS_TEXT and
4885       friends need to change. */
4886      if (!UTF) {
4887       ST.c2 = ST.c1 = *s;
4888       if (IS_TEXTF(text_node))
4889        ST.c2 = PL_fold[ST.c1];
4890       else if (IS_TEXTFL(text_node))
4891        ST.c2 = PL_fold_locale[ST.c1];
4892      }
4893      else { /* UTF */
4894       if (IS_TEXTF(text_node)) {
4895        STRLEN ulen1, ulen2;
4896        U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4897        U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4898
4899        to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4900        to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4901 #ifdef EBCDIC
4902        ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4903              ckWARN(WARN_UTF8) ?
4904              0 : UTF8_ALLOW_ANY);
4905        ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4906              ckWARN(WARN_UTF8) ?
4907              0 : UTF8_ALLOW_ANY);
4908 #else
4909        ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4910              uniflags);
4911        ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4912              uniflags);
4913 #endif
4914       }
4915       else {
4916        ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4917              uniflags);
4918       }
4919      }
4920     }
4921    }
4922    else
4923     ST.c1 = ST.c2 = CHRTEST_VOID;
4924   assume_ok_easy:
4925
4926    ST.A = scan;
4927    ST.B = next;
4928    PL_reginput = locinput;
4929    if (minmod) {
4930     minmod = 0;
4931     if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4932      sayNO;
4933     ST.count = ST.min;
4934     locinput = PL_reginput;
4935     REGCP_SET(ST.cp);
4936     if (ST.c1 == CHRTEST_VOID)
4937      goto curly_try_B_min;
4938
4939     ST.oldloc = locinput;
4940
4941     /* set ST.maxpos to the furthest point along the
4942     * string that could possibly match */
4943     if  (ST.max == REG_INFTY) {
4944      ST.maxpos = PL_regeol - 1;
4945      if (do_utf8)
4946       while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4947        ST.maxpos--;
4948     }
4949     else if (do_utf8) {
4950      int m = ST.max - ST.min;
4951      for (ST.maxpos = locinput;
4952       m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4953       ST.maxpos += UTF8SKIP(ST.maxpos);
4954     }
4955     else {
4956      ST.maxpos = locinput + ST.max - ST.min;
4957      if (ST.maxpos >= PL_regeol)
4958       ST.maxpos = PL_regeol - 1;
4959     }
4960     goto curly_try_B_min_known;
4961
4962    }
4963    else {
4964     ST.count = regrepeat(rex, ST.A, ST.max, depth);
4965     locinput = PL_reginput;
4966     if (ST.count < ST.min)
4967      sayNO;
4968     if ((ST.count > ST.min)
4969      && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4970     {
4971      /* A{m,n} must come at the end of the string, there's
4972      * no point in backing off ... */
4973      ST.min = ST.count;
4974      /* ...except that $ and \Z can match before *and* after
4975      newline at the end.  Consider "\n\n" =~ /\n+\Z\n/.
4976      We may back off by one in this case. */
4977      if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4978       ST.min--;
4979     }
4980     REGCP_SET(ST.cp);
4981     goto curly_try_B_max;
4982    }
4983    /* NOTREACHED */
4984
4985
4986   case CURLY_B_min_known_fail:
4987    /* failed to find B in a non-greedy match where c1,c2 valid */
4988    if (ST.paren && ST.count)
4989     PL_regoffs[ST.paren].end = -1;
4990
4991    PL_reginput = locinput; /* Could be reset... */
4992    REGCP_UNWIND(ST.cp);
4993    /* Couldn't or didn't -- move forward. */
4994    ST.oldloc = locinput;
4995    if (do_utf8)
4996     locinput += UTF8SKIP(locinput);
4997    else
4998     locinput++;
4999    ST.count++;
5000   curly_try_B_min_known:
5001    /* find the next place where 'B' could work, then call B */
5002    {
5003     int n;
5004     if (do_utf8) {
5005      n = (ST.oldloc == locinput) ? 0 : 1;
5006      if (ST.c1 == ST.c2) {
5007       STRLEN len;
5008       /* set n to utf8_distance(oldloc, locinput) */
5009       while (locinput <= ST.maxpos &&
5010        utf8n_to_uvchr((U8*)locinput,
5011            UTF8_MAXBYTES, &len,
5012            uniflags) != (UV)ST.c1) {
5013        locinput += len;
5014        n++;
5015       }
5016      }
5017      else {
5018       /* set n to utf8_distance(oldloc, locinput) */
5019       while (locinput <= ST.maxpos) {
5020        STRLEN len;
5021        const UV c = utf8n_to_uvchr((U8*)locinput,
5022             UTF8_MAXBYTES, &len,
5023             uniflags);
5024        if (c == (UV)ST.c1 || c == (UV)ST.c2)
5025         break;
5026        locinput += len;
5027        n++;
5028       }
5029      }
5030     }
5031     else {
5032      if (ST.c1 == ST.c2) {
5033       while (locinput <= ST.maxpos &&
5034        UCHARAT(locinput) != ST.c1)
5035        locinput++;
5036      }
5037      else {
5038       while (locinput <= ST.maxpos
5039        && UCHARAT(locinput) != ST.c1
5040        && UCHARAT(locinput) != ST.c2)
5041        locinput++;
5042      }
5043      n = locinput - ST.oldloc;
5044     }
5045     if (locinput > ST.maxpos)
5046      sayNO;
5047     /* PL_reginput == oldloc now */
5048     if (n) {
5049      ST.count += n;
5050      if (regrepeat(rex, ST.A, n, depth) < n)
5051       sayNO;
5052     }
5053     PL_reginput = locinput;
5054     CURLY_SETPAREN(ST.paren, ST.count);
5055     if (cur_eval && cur_eval->u.eval.close_paren &&
5056      cur_eval->u.eval.close_paren == (U32)ST.paren) {
5057      goto fake_end;
5058     }
5059     PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5060    }
5061    /* NOTREACHED */
5062
5063
5064   case CURLY_B_min_fail:
5065    /* failed to find B in a non-greedy match where c1,c2 invalid */
5066    if (ST.paren && ST.count)
5067     PL_regoffs[ST.paren].end = -1;
5068
5069    REGCP_UNWIND(ST.cp);
5070    /* failed -- move forward one */
5071    PL_reginput = locinput;
5072    if (regrepeat(rex, ST.A, 1, depth)) {
5073     ST.count++;
5074     locinput = PL_reginput;
5075     if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5076       ST.count > 0)) /* count overflow ? */
5077     {
5078     curly_try_B_min:
5079      CURLY_SETPAREN(ST.paren, ST.count);
5080      if (cur_eval && cur_eval->u.eval.close_paren &&
5081       cur_eval->u.eval.close_paren == (U32)ST.paren) {
5082       goto fake_end;
5083      }
5084      PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5085     }
5086    }
5087    sayNO;
5088    /* NOTREACHED */
5089
5090
5091   curly_try_B_max:
5092    /* a successful greedy match: now try to match B */
5093    if (cur_eval && cur_eval->u.eval.close_paren &&
5094     cur_eval->u.eval.close_paren == (U32)ST.paren) {
5095     goto fake_end;
5096    }
5097    {
5098     UV c = 0;
5099     if (ST.c1 != CHRTEST_VOID)
5100      c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
5101           UTF8_MAXBYTES, 0, uniflags)
5102         : (UV) UCHARAT(PL_reginput);
5103     /* If it could work, try it. */
5104     if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5105      CURLY_SETPAREN(ST.paren, ST.count);
5106      PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5107      /* NOTREACHED */
5108     }
5109    }
5110    /* FALL THROUGH */
5111   case CURLY_B_max_fail:
5112    /* failed to find B in a greedy match */
5113    if (ST.paren && ST.count)
5114     PL_regoffs[ST.paren].end = -1;
5115
5116    REGCP_UNWIND(ST.cp);
5117    /*  back up. */
5118    if (--ST.count < ST.min)
5119     sayNO;
5120    PL_reginput = locinput = HOPc(locinput, -1);
5121    goto curly_try_B_max;
5122
5123 #undef ST
5124
5125   case END:
5126    fake_end:
5127    if (cur_eval) {
5128     /* we've just finished A in /(??{A})B/; now continue with B */
5129     I32 tmpix;
5130     st->u.eval.toggle_reg_flags
5131        = cur_eval->u.eval.toggle_reg_flags;
5132     PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5133
5134     st->u.eval.prev_rex = rex_sv;  /* inner */
5135     SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5136     rex = (struct regexp *)SvANY(rex_sv);
5137     rexi = RXi_GET(rex);
5138     cur_curlyx = cur_eval->u.eval.prev_curlyx;
5139     ReREFCNT_inc(rex_sv);
5140     st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
5141
5142     /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5143     PL_reglastparen = &rex->lastparen;
5144     PL_reglastcloseparen = &rex->lastcloseparen;
5145
5146     REGCP_SET(st->u.eval.lastcp);
5147     PL_reginput = locinput;
5148
5149     /* Restore parens of the outer rex without popping the
5150     * savestack */
5151     tmpix = PL_savestack_ix;
5152     PL_savestack_ix = cur_eval->u.eval.lastcp;
5153     regcppop(rex);
5154     PL_savestack_ix = tmpix;
5155
5156     st->u.eval.prev_eval = cur_eval;
5157     cur_eval = cur_eval->u.eval.prev_eval;
5158     DEBUG_EXECUTE_r(
5159      PerlIO_printf(Perl_debug_log, "%*s  EVAL trying tail ... %"UVxf"\n",
5160          REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5161     if ( nochange_depth )
5162      nochange_depth--;
5163
5164     PUSH_YES_STATE_GOTO(EVAL_AB,
5165       st->u.eval.prev_eval->u.eval.B); /* match B */
5166    }
5167
5168    if (locinput < reginfo->till) {
5169     DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5170          "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5171          PL_colors[4],
5172          (long)(locinput - PL_reg_starttry),
5173          (long)(reginfo->till - PL_reg_starttry),
5174          PL_colors[5]));
5175
5176     sayNO_SILENT;  /* Cannot match: too short. */
5177    }
5178    PL_reginput = locinput; /* put where regtry can find it */
5179    sayYES;   /* Success! */
5180
5181   case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5182    DEBUG_EXECUTE_r(
5183    PerlIO_printf(Perl_debug_log,
5184     "%*s  %ssubpattern success...%s\n",
5185     REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5186    PL_reginput = locinput; /* put where regtry can find it */
5187    sayYES;   /* Success! */
5188
5189 #undef  ST
5190 #define ST st->u.ifmatch
5191
5192   case SUSPEND: /* (?>A) */
5193    ST.wanted = 1;
5194    PL_reginput = locinput;
5195    goto do_ifmatch;
5196
5197   case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5198    ST.wanted = 0;
5199    goto ifmatch_trivial_fail_test;
5200
5201   case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5202    ST.wanted = 1;
5203   ifmatch_trivial_fail_test:
5204    if (scan->flags) {
5205     char * const s = HOPBACKc(locinput, scan->flags);
5206     if (!s) {
5207      /* trivial fail */
5208      if (logical) {
5209       logical = 0;
5210       sw = 1 - (bool)ST.wanted;
5211      }
5212      else if (ST.wanted)
5213       sayNO;
5214      next = scan + ARG(scan);
5215      if (next == scan)
5216       next = NULL;
5217      break;
5218     }
5219     PL_reginput = s;
5220    }
5221    else
5222     PL_reginput = locinput;
5223
5224   do_ifmatch:
5225    ST.me = scan;
5226    ST.logical = logical;
5227    logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5228
5229    /* execute body of (?...A) */
5230    PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5231    /* NOTREACHED */
5232
5233   case IFMATCH_A_fail: /* body of (?...A) failed */
5234    ST.wanted = !ST.wanted;
5235    /* FALL THROUGH */
5236
5237   case IFMATCH_A: /* body of (?...A) succeeded */
5238    if (ST.logical) {
5239     sw = (bool)ST.wanted;
5240    }
5241    else if (!ST.wanted)
5242     sayNO;
5243
5244    if (OP(ST.me) == SUSPEND)
5245     locinput = PL_reginput;
5246    else {
5247     locinput = PL_reginput = st->locinput;
5248     nextchr = UCHARAT(locinput);
5249    }
5250    scan = ST.me + ARG(ST.me);
5251    if (scan == ST.me)
5252     scan = NULL;
5253    continue; /* execute B */
5254
5255 #undef ST
5256
5257   case LONGJMP:
5258    next = scan + ARG(scan);
5259    if (next == scan)
5260     next = NULL;
5261    break;
5262   case COMMIT:
5263    reginfo->cutpoint = PL_regeol;
5264    /* FALLTHROUGH */
5265   case PRUNE:
5266    PL_reginput = locinput;
5267    if (!scan->flags)
5268     sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5269    PUSH_STATE_GOTO(COMMIT_next,next);
5270    /* NOTREACHED */
5271   case COMMIT_next_fail:
5272    no_final = 1;
5273    /* FALLTHROUGH */
5274   case OPFAIL:
5275    sayNO;
5276    /* NOTREACHED */
5277
5278 #define ST st->u.mark
5279   case MARKPOINT:
5280    ST.prev_mark = mark_state;
5281    ST.mark_name = sv_commit = sv_yes_mark
5282     = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5283    mark_state = st;
5284    ST.mark_loc = PL_reginput = locinput;
5285    PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5286    /* NOTREACHED */
5287   case MARKPOINT_next:
5288    mark_state = ST.prev_mark;
5289    sayYES;
5290    /* NOTREACHED */
5291   case MARKPOINT_next_fail:
5292    if (popmark && sv_eq(ST.mark_name,popmark))
5293    {
5294     if (ST.mark_loc > startpoint)
5295      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5296     popmark = NULL; /* we found our mark */
5297     sv_commit = ST.mark_name;
5298
5299     DEBUG_EXECUTE_r({
5300       PerlIO_printf(Perl_debug_log,
5301        "%*s  %ssetting cutpoint to mark:%"SVf"...%s\n",
5302        REPORT_CODE_OFF+depth*2, "",
5303        PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5304     });
5305    }
5306    mark_state = ST.prev_mark;
5307    sv_yes_mark = mark_state ?
5308     mark_state->u.mark.mark_name : NULL;
5309    sayNO;
5310    /* NOTREACHED */
5311   case SKIP:
5312    PL_reginput = locinput;
5313    if (scan->flags) {
5314     /* (*SKIP) : if we fail we cut here*/
5315     ST.mark_name = NULL;
5316     ST.mark_loc = locinput;
5317     PUSH_STATE_GOTO(SKIP_next,next);
5318    } else {
5319     /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5320     otherwise do nothing.  Meaning we need to scan
5321     */
5322     regmatch_state *cur = mark_state;
5323     SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5324
5325     while (cur) {
5326      if ( sv_eq( cur->u.mark.mark_name,
5327         find ) )
5328      {
5329       ST.mark_name = find;
5330       PUSH_STATE_GOTO( SKIP_next, next );
5331      }
5332      cur = cur->u.mark.prev_mark;
5333     }
5334    }
5335    /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5336    break;
5337   case SKIP_next_fail:
5338    if (ST.mark_name) {
5339     /* (*CUT:NAME) - Set up to search for the name as we
5340     collapse the stack*/
5341     popmark = ST.mark_name;
5342    } else {
5343     /* (*CUT) - No name, we cut here.*/
5344     if (ST.mark_loc > startpoint)
5345      reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5346     /* but we set sv_commit to latest mark_name if there
5347     is one so they can test to see how things lead to this
5348     cut */
5349     if (mark_state)
5350      sv_commit=mark_state->u.mark.mark_name;
5351    }
5352    no_final = 1;
5353    sayNO;
5354    /* NOTREACHED */
5355 #undef ST
5356   case FOLDCHAR:
5357    n = ARG(scan);
5358    if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5359     locinput += ln;
5360    } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5361     sayNO;
5362    } else  {
5363     U8 folded[UTF8_MAXBYTES_CASE+1];
5364     STRLEN foldlen;
5365     const char * const l = locinput;
5366     char *e = PL_regeol;
5367     to_uni_fold(n, folded, &foldlen);
5368
5369     if (ibcmp_utf8((const char*) folded, 0,  foldlen, 1,
5370        l, &e, 0,  do_utf8)) {
5371       sayNO;
5372     }
5373     locinput = e;
5374    }
5375    nextchr = UCHARAT(locinput);
5376    break;
5377   case LNBREAK:
5378    if ((n=is_LNBREAK(locinput,do_utf8))) {
5379     locinput += n;
5380     nextchr = UCHARAT(locinput);
5381    } else
5382     sayNO;
5383    break;
5384
5385 #define CASE_CLASS(nAmE)                              \
5386   case nAmE:                                    \
5387    if ((n=is_##nAmE(locinput,do_utf8))) {    \
5388     locinput += n;                        \
5389     nextchr = UCHARAT(locinput);          \
5390    } else                                    \
5391     sayNO;                                \
5392    break;                                    \
5393   case N##nAmE:                                 \
5394    if ((n=is_##nAmE(locinput,do_utf8))) {    \
5395     sayNO;                                \
5396    } else {                                  \
5397     locinput += UTF8SKIP(locinput);       \
5398     nextchr = UCHARAT(locinput);          \
5399    }                                         \
5400    break
5401
5402   CASE_CLASS(VERTWS);
5403   CASE_CLASS(HORIZWS);
5404 #undef CASE_CLASS
5405
5406   default:
5407    PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5408       PTR2UV(scan), OP(scan));
5409    Perl_croak(aTHX_ "regexp memory corruption");
5410
5411   } /* end switch */
5412
5413   /* switch break jumps here */
5414   scan = next; /* prepare to execute the next op and ... */
5415   continue;    /* ... jump back to the top, reusing st */
5416   /* NOTREACHED */
5417
5418  push_yes_state:
5419   /* push a state that backtracks on success */
5420   st->u.yes.prev_yes_state = yes_state;
5421   yes_state = st;
5422   /* FALL THROUGH */
5423  push_state:
5424   /* push a new regex state, then continue at scan  */
5425   {
5426    regmatch_state *newst;
5427
5428    DEBUG_STACK_r({
5429     regmatch_state *cur = st;
5430     regmatch_state *curyes = yes_state;
5431     int curd = depth;
5432     regmatch_slab *slab = PL_regmatch_slab;
5433     for (;curd > -1;cur--,curd--) {
5434      if (cur < SLAB_FIRST(slab)) {
5435       slab = slab->prev;
5436       cur = SLAB_LAST(slab);
5437      }
5438      PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5439       REPORT_CODE_OFF + 2 + depth * 2,"",
5440       curd, PL_reg_name[cur->resume_state],
5441       (curyes == cur) ? "yes" : ""
5442      );
5443      if (curyes == cur)
5444       curyes = cur->u.yes.prev_yes_state;
5445     }
5446    } else
5447     DEBUG_STATE_pp("push")
5448    );
5449    depth++;
5450    st->locinput = locinput;
5451    newst = st+1;
5452    if (newst >  SLAB_LAST(PL_regmatch_slab))
5453     newst = S_push_slab(aTHX);
5454    PL_regmatch_state = newst;
5455
5456    locinput = PL_reginput;
5457    nextchr = UCHARAT(locinput);
5458    st = newst;
5459    continue;
5460    /* NOTREACHED */
5461   }
5462  }
5463
5464  /*
5465  * We get here only if there's trouble -- normally "case END" is
5466  * the terminating point.
5467  */
5468  Perl_croak(aTHX_ "corrupted regexp pointers");
5469  /*NOTREACHED*/
5470  sayNO;
5471
5472 yes:
5473  if (yes_state) {
5474   /* we have successfully completed a subexpression, but we must now
5475   * pop to the state marked by yes_state and continue from there */
5476   assert(st != yes_state);
5477 #ifdef DEBUGGING
5478   while (st != yes_state) {
5479    st--;
5480    if (st < SLAB_FIRST(PL_regmatch_slab)) {
5481     PL_regmatch_slab = PL_regmatch_slab->prev;
5482     st = SLAB_LAST(PL_regmatch_slab);
5483    }
5484    DEBUG_STATE_r({
5485     if (no_final) {
5486      DEBUG_STATE_pp("pop (no final)");
5487     } else {
5488      DEBUG_STATE_pp("pop (yes)");
5489     }
5490    });
5491    depth--;
5492   }
5493 #else
5494   while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5495    || yes_state > SLAB_LAST(PL_regmatch_slab))
5496   {
5497    /* not in this slab, pop slab */
5498    depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5499    PL_regmatch_slab = PL_regmatch_slab->prev;
5500    st = SLAB_LAST(PL_regmatch_slab);
5501   }
5502   depth -= (st - yes_state);
5503 #endif
5504   st = yes_state;
5505   yes_state = st->u.yes.prev_yes_state;
5506   PL_regmatch_state = st;
5507
5508   if (no_final) {
5509    locinput= st->locinput;
5510    nextchr = UCHARAT(locinput);
5511   }
5512   state_num = st->resume_state + no_final;
5513   goto reenter_switch;
5514  }
5515
5516  DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5517       PL_colors[4], PL_colors[5]));
5518
5519  if (PL_reg_eval_set) {
5520   /* each successfully executed (?{...}) block does the equivalent of
5521   *   local $^R = do {...}
5522   * When popping the save stack, all these locals would be undone;
5523   * bypass this by setting the outermost saved $^R to the latest
5524   * value */
5525   if (oreplsv != GvSV(PL_replgv))
5526    sv_setsv(oreplsv, GvSV(PL_replgv));
5527  }
5528  result = 1;
5529  goto final_exit;
5530
5531 no:
5532  DEBUG_EXECUTE_r(
5533   PerlIO_printf(Perl_debug_log,
5534    "%*s  %sfailed...%s\n",
5535    REPORT_CODE_OFF+depth*2, "",
5536    PL_colors[4], PL_colors[5])
5537   );
5538
5539 no_silent:
5540  if (no_final) {
5541   if (yes_state) {
5542    goto yes;
5543   } else {
5544    goto final_exit;
5545   }
5546  }
5547  if (depth) {
5548   /* there's a previous state to backtrack to */
5549   st--;
5550   if (st < SLAB_FIRST(PL_regmatch_slab)) {
5551    PL_regmatch_slab = PL_regmatch_slab->prev;
5552    st = SLAB_LAST(PL_regmatch_slab);
5553   }
5554   PL_regmatch_state = st;
5555   locinput= st->locinput;
5556   nextchr = UCHARAT(locinput);
5557
5558   DEBUG_STATE_pp("pop");
5559   depth--;
5560   if (yes_state == st)
5561    yes_state = st->u.yes.prev_yes_state;
5562
5563   state_num = st->resume_state + 1; /* failure = success + 1 */
5564   goto reenter_switch;
5565  }
5566  result = 0;
5567
5568   final_exit:
5569  if (rex->intflags & PREGf_VERBARG_SEEN) {
5570   SV *sv_err = get_sv("REGERROR", 1);
5571   SV *sv_mrk = get_sv("REGMARK", 1);
5572   if (result) {
5573    sv_commit = &PL_sv_no;
5574    if (!sv_yes_mark)
5575     sv_yes_mark = &PL_sv_yes;
5576   } else {
5577    if (!sv_commit)
5578     sv_commit = &PL_sv_yes;
5579    sv_yes_mark = &PL_sv_no;
5580   }
5581   sv_setsv(sv_err, sv_commit);
5582   sv_setsv(sv_mrk, sv_yes_mark);
5583  }
5584
5585  /* clean up; in particular, free all slabs above current one */
5586  LEAVE_SCOPE(oldsave);
5587
5588  return result;
5589 }
5590
5591 /*
5592  - regrepeat - repeatedly match something simple, report how many
5593  */
5594 /*
5595  * [This routine now assumes that it will only match on things of length 1.
5596  * That was true before, but now we assume scan - reginput is the count,
5597  * rather than incrementing count on every character.  [Er, except utf8.]]
5598  */
5599 STATIC I32
5600 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5601 {
5602  dVAR;
5603  register char *scan;
5604  register I32 c;
5605  register char *loceol = PL_regeol;
5606  register I32 hardcount = 0;
5607  register bool do_utf8 = PL_reg_match_utf8;
5608 #ifndef DEBUGGING
5609  PERL_UNUSED_ARG(depth);
5610 #endif
5611
5612  PERL_ARGS_ASSERT_REGREPEAT;
5613
5614  scan = PL_reginput;
5615  if (max == REG_INFTY)
5616   max = I32_MAX;
5617  else if (max < loceol - scan)
5618   loceol = scan + max;
5619  switch (OP(p)) {
5620  case REG_ANY:
5621   if (do_utf8) {
5622    loceol = PL_regeol;
5623    while (scan < loceol && hardcount < max && *scan != '\n') {
5624     scan += UTF8SKIP(scan);
5625     hardcount++;
5626    }
5627   } else {
5628    while (scan < loceol && *scan != '\n')
5629     scan++;
5630   }
5631   break;
5632  case SANY:
5633   if (do_utf8) {
5634    loceol = PL_regeol;
5635    while (scan < loceol && hardcount < max) {
5636     scan += UTF8SKIP(scan);
5637     hardcount++;
5638    }
5639   }
5640   else
5641    scan = loceol;
5642   break;
5643  case CANY:
5644   scan = loceol;
5645   break;
5646  case EXACT:  /* length of string is 1 */
5647   c = (U8)*STRING(p);
5648   while (scan < loceol && UCHARAT(scan) == c)
5649    scan++;
5650   break;
5651  case EXACTF: /* length of string is 1 */
5652   c = (U8)*STRING(p);
5653   while (scan < loceol &&
5654    (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5655    scan++;
5656   break;
5657  case EXACTFL: /* length of string is 1 */
5658   PL_reg_flags |= RF_tainted;
5659   c = (U8)*STRING(p);
5660   while (scan < loceol &&
5661    (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5662    scan++;
5663   break;
5664  case ANYOF:
5665   if (do_utf8) {
5666    loceol = PL_regeol;
5667    while (hardcount < max && scan < loceol &&
5668     reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5669     scan += UTF8SKIP(scan);
5670     hardcount++;
5671    }
5672   } else {
5673    while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5674     scan++;
5675   }
5676   break;
5677  case ALNUM:
5678   if (do_utf8) {
5679    loceol = PL_regeol;
5680    LOAD_UTF8_CHARCLASS_ALNUM();
5681    while (hardcount < max && scan < loceol &&
5682     swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5683     scan += UTF8SKIP(scan);
5684     hardcount++;
5685    }
5686   } else {
5687    while (scan < loceol && isALNUM(*scan))
5688     scan++;
5689   }
5690   break;
5691  case ALNUML:
5692   PL_reg_flags |= RF_tainted;
5693   if (do_utf8) {
5694    loceol = PL_regeol;
5695    while (hardcount < max && scan < loceol &&
5696     isALNUM_LC_utf8((U8*)scan)) {
5697     scan += UTF8SKIP(scan);
5698     hardcount++;
5699    }
5700   } else {
5701    while (scan < loceol && isALNUM_LC(*scan))
5702     scan++;
5703   }
5704   break;
5705  case NALNUM:
5706   if (do_utf8) {
5707    loceol = PL_regeol;
5708    LOAD_UTF8_CHARCLASS_ALNUM();
5709    while (hardcount < max && scan < loceol &&
5710     !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5711     scan += UTF8SKIP(scan);
5712     hardcount++;
5713    }
5714   } else {
5715    while (scan < loceol && !isALNUM(*scan))
5716     scan++;
5717   }
5718   break;
5719  case NALNUML:
5720   PL_reg_flags |= RF_tainted;
5721   if (do_utf8) {
5722    loceol = PL_regeol;
5723    while (hardcount < max && scan < loceol &&
5724     !isALNUM_LC_utf8((U8*)scan)) {
5725     scan += UTF8SKIP(scan);
5726     hardcount++;
5727    }
5728   } else {
5729    while (scan < loceol && !isALNUM_LC(*scan))
5730     scan++;
5731   }
5732   break;
5733  case SPACE:
5734   if (do_utf8) {
5735    loceol = PL_regeol;
5736    LOAD_UTF8_CHARCLASS_SPACE();
5737    while (hardcount < max && scan < loceol &&
5738     (*scan == ' ' ||
5739      swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5740     scan += UTF8SKIP(scan);
5741     hardcount++;
5742    }
5743   } else {
5744    while (scan < loceol && isSPACE(*scan))
5745     scan++;
5746   }
5747   break;
5748  case SPACEL:
5749   PL_reg_flags |= RF_tainted;
5750   if (do_utf8) {
5751    loceol = PL_regeol;
5752    while (hardcount < max && scan < loceol &&
5753     (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5754     scan += UTF8SKIP(scan);
5755     hardcount++;
5756    }
5757   } else {
5758    while (scan < loceol && isSPACE_LC(*scan))
5759     scan++;
5760   }
5761   break;
5762  case NSPACE:
5763   if (do_utf8) {
5764    loceol = PL_regeol;
5765    LOAD_UTF8_CHARCLASS_SPACE();
5766    while (hardcount < max && scan < loceol &&
5767     !(*scan == ' ' ||
5768      swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5769     scan += UTF8SKIP(scan);
5770     hardcount++;
5771    }
5772   } else {
5773    while (scan < loceol && !isSPACE(*scan))
5774     scan++;
5775   }
5776   break;
5777  case NSPACEL:
5778   PL_reg_flags |= RF_tainted;
5779   if (do_utf8) {
5780    loceol = PL_regeol;
5781    while (hardcount < max && scan < loceol &&
5782     !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5783     scan += UTF8SKIP(scan);
5784     hardcount++;
5785    }
5786   } else {
5787    while (scan < loceol && !isSPACE_LC(*scan))
5788     scan++;
5789   }
5790   break;
5791  case DIGIT:
5792   if (do_utf8) {
5793    loceol = PL_regeol;
5794    LOAD_UTF8_CHARCLASS_DIGIT();
5795    while (hardcount < max && scan < loceol &&
5796     swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5797     scan += UTF8SKIP(scan);
5798     hardcount++;
5799    }
5800   } else {
5801    while (scan < loceol && isDIGIT(*scan))
5802     scan++;
5803   }
5804   break;
5805  case NDIGIT:
5806   if (do_utf8) {
5807    loceol = PL_regeol;
5808    LOAD_UTF8_CHARCLASS_DIGIT();
5809    while (hardcount < max && scan < loceol &&
5810     !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5811     scan += UTF8SKIP(scan);
5812     hardcount++;
5813    }
5814   } else {
5815    while (scan < loceol && !isDIGIT(*scan))
5816     scan++;
5817   }
5818  case LNBREAK:
5819   if (do_utf8) {
5820    loceol = PL_regeol;
5821    while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5822     scan += c;
5823     hardcount++;
5824    }
5825   } else {
5826    /*
5827    LNBREAK can match two latin chars, which is ok,
5828    because we have a null terminated string, but we
5829    have to use hardcount in this situation
5830    */
5831    while (scan < loceol && (c=is_LNBREAK_latin1(scan)))  {
5832     scan+=c;
5833     hardcount++;
5834    }
5835   }
5836   break;
5837  case HORIZWS:
5838   if (do_utf8) {
5839    loceol = PL_regeol;
5840    while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5841     scan += c;
5842     hardcount++;
5843    }
5844   } else {
5845    while (scan < loceol && is_HORIZWS_latin1(scan))
5846     scan++;
5847   }
5848   break;
5849  case NHORIZWS:
5850   if (do_utf8) {
5851    loceol = PL_regeol;
5852    while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5853     scan += UTF8SKIP(scan);
5854     hardcount++;
5855    }
5856   } else {
5857    while (scan < loceol && !is_HORIZWS_latin1(scan))
5858     scan++;
5859
5860   }
5861   break;
5862  case VERTWS:
5863   if (do_utf8) {
5864    loceol = PL_regeol;
5865    while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5866     scan += c;
5867     hardcount++;
5868    }
5869   } else {
5870    while (scan < loceol && is_VERTWS_latin1(scan))
5871     scan++;
5872
5873   }
5874   break;
5875  case NVERTWS:
5876   if (do_utf8) {
5877    loceol = PL_regeol;
5878    while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5879     scan += UTF8SKIP(scan);
5880     hardcount++;
5881    }
5882   } else {
5883    while (scan < loceol && !is_VERTWS_latin1(scan))
5884     scan++;
5885
5886   }
5887   break;
5888
5889  default:  /* Called on something of 0 width. */
5890   break;  /* So match right here or not at all. */
5891  }
5892
5893  if (hardcount)
5894   c = hardcount;
5895  else
5896   c = scan - PL_reginput;
5897  PL_reginput = scan;
5898
5899  DEBUG_r({
5900   GET_RE_DEBUG_FLAGS_DECL;
5901   DEBUG_EXECUTE_r({
5902    SV * const prop = sv_newmortal();
5903    regprop(prog, prop, p);
5904    PerlIO_printf(Perl_debug_log,
5905       "%*s  %s can match %"IVdf" times out of %"IVdf"...\n",
5906       REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5907   });
5908  });
5909
5910  return(c);
5911 }
5912
5913
5914 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5915 /*
5916 - regclass_swash - prepare the utf8 swash
5917 */
5918
5919 SV *
5920 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5921 {
5922  dVAR;
5923  SV *sw  = NULL;
5924  SV *si  = NULL;
5925  SV *alt = NULL;
5926  RXi_GET_DECL(prog,progi);
5927  const struct reg_data * const data = prog ? progi->data : NULL;
5928
5929  PERL_ARGS_ASSERT_REGCLASS_SWASH;
5930
5931  if (data && data->count) {
5932   const U32 n = ARG(node);
5933
5934   if (data->what[n] == 's') {
5935    SV * const rv = MUTABLE_SV(data->data[n]);
5936    AV * const av = MUTABLE_AV(SvRV(rv));
5937    SV **const ary = AvARRAY(av);
5938    SV **a, **b;
5939
5940    /* See the end of regcomp.c:S_regclass() for
5941    * documentation of these array elements. */
5942
5943    si = *ary;
5944    a  = SvROK(ary[1]) ? &ary[1] : NULL;
5945    b  = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5946
5947    if (a)
5948     sw = *a;
5949    else if (si && doinit) {
5950     sw = swash_init("utf8", "", si, 1, 0);
5951     (void)av_store(av, 1, sw);
5952    }
5953    if (b)
5954     alt = *b;
5955   }
5956  }
5957
5958  if (listsvp)
5959   *listsvp = si;
5960  if (altsvp)
5961   *altsvp  = alt;
5962
5963  return sw;
5964 }
5965 #endif
5966
5967 /*
5968  - reginclass - determine if a character falls into a character class
5969
5970   The n is the ANYOF regnode, the p is the target string, lenp
5971   is pointer to the maximum length of how far to go in the p
5972   (if the lenp is zero, UTF8SKIP(p) is used),
5973   do_utf8 tells whether the target string is in UTF-8.
5974
5975  */
5976
5977 STATIC bool
5978 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5979 {
5980  dVAR;
5981  const char flags = ANYOF_FLAGS(n);
5982  bool match = FALSE;
5983  UV c = *p;
5984  STRLEN len = 0;
5985  STRLEN plen;
5986
5987  PERL_ARGS_ASSERT_REGINCLASS;
5988
5989  if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5990   c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5991     (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
5992     | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
5993     /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
5994     * UTF8_ALLOW_FFFF */
5995   if (len == (STRLEN)-1)
5996    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5997  }
5998
5999  plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
6000  if (do_utf8 || (flags & ANYOF_UNICODE)) {
6001   if (lenp)
6002    *lenp = 0;
6003   if (do_utf8 && !ANYOF_RUNTIME(n)) {
6004    if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
6005     match = TRUE;
6006   }
6007   if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
6008    match = TRUE;
6009   if (!match) {
6010    AV *av;
6011    SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6012
6013    if (sw) {
6014     U8 * utf8_p;
6015     if (do_utf8) {
6016      utf8_p = (U8 *) p;
6017     } else {
6018      STRLEN len = 1;
6019      utf8_p = bytes_to_utf8(p, &len);
6020     }
6021     if (swash_fetch(sw, utf8_p, 1))
6022      match = TRUE;
6023     else if (flags & ANYOF_FOLD) {
6024      if (!match && lenp && av) {
6025       I32 i;
6026       for (i = 0; i <= av_len(av); i++) {
6027        SV* const sv = *av_fetch(av, i, FALSE);
6028        STRLEN len;
6029        const char * const s = SvPV_const(sv, len);
6030        if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
6031         *lenp = len;
6032         match = TRUE;
6033         break;
6034        }
6035       }
6036      }
6037      if (!match) {
6038       U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
6039
6040       STRLEN tmplen;
6041       to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6042       if (swash_fetch(sw, tmpbuf, 1))
6043        match = TRUE;
6044      }
6045     }
6046
6047     /* If we allocated a string above, free it */
6048     if (! do_utf8) Safefree(utf8_p);
6049    }
6050   }
6051   if (match && lenp && *lenp == 0)
6052    *lenp = UNISKIP(NATIVE_TO_UNI(c));
6053  }
6054  if (!match && c < 256) {
6055   if (ANYOF_BITMAP_TEST(n, c))
6056    match = TRUE;
6057   else if (flags & ANYOF_FOLD) {
6058    U8 f;
6059
6060    if (flags & ANYOF_LOCALE) {
6061     PL_reg_flags |= RF_tainted;
6062     f = PL_fold_locale[c];
6063    }
6064    else
6065     f = PL_fold[c];
6066    if (f != c && ANYOF_BITMAP_TEST(n, f))
6067     match = TRUE;
6068   }
6069
6070   if (!match && (flags & ANYOF_CLASS)) {
6071    PL_reg_flags |= RF_tainted;
6072    if (
6073     (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
6074     (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||
6075     (ANYOF_CLASS_TEST(n, ANYOF_SPACE)   &&  isSPACE_LC(c))  ||
6076     (ANYOF_CLASS_TEST(n, ANYOF_NSPACE)  && !isSPACE_LC(c))  ||
6077     (ANYOF_CLASS_TEST(n, ANYOF_DIGIT)   &&  isDIGIT_LC(c))  ||
6078     (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT)  && !isDIGIT_LC(c))  ||
6079     (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC)  &&  isALNUMC_LC(c)) ||
6080     (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6081     (ANYOF_CLASS_TEST(n, ANYOF_ALPHA)   &&  isALPHA_LC(c))  ||
6082     (ANYOF_CLASS_TEST(n, ANYOF_NALPHA)  && !isALPHA_LC(c))  ||
6083     (ANYOF_CLASS_TEST(n, ANYOF_ASCII)   &&  isASCII(c))     ||
6084     (ANYOF_CLASS_TEST(n, ANYOF_NASCII)  && !isASCII(c))     ||
6085     (ANYOF_CLASS_TEST(n, ANYOF_CNTRL)   &&  isCNTRL_LC(c))  ||
6086     (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL)  && !isCNTRL_LC(c))  ||
6087     (ANYOF_CLASS_TEST(n, ANYOF_GRAPH)   &&  isGRAPH_LC(c))  ||
6088     (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH)  && !isGRAPH_LC(c))  ||
6089     (ANYOF_CLASS_TEST(n, ANYOF_LOWER)   &&  isLOWER_LC(c))  ||
6090     (ANYOF_CLASS_TEST(n, ANYOF_NLOWER)  && !isLOWER_LC(c))  ||
6091     (ANYOF_CLASS_TEST(n, ANYOF_PRINT)   &&  isPRINT_LC(c))  ||
6092     (ANYOF_CLASS_TEST(n, ANYOF_NPRINT)  && !isPRINT_LC(c))  ||
6093     (ANYOF_CLASS_TEST(n, ANYOF_PUNCT)   &&  isPUNCT_LC(c))  ||
6094     (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT)  && !isPUNCT_LC(c))  ||
6095     (ANYOF_CLASS_TEST(n, ANYOF_UPPER)   &&  isUPPER_LC(c))  ||
6096     (ANYOF_CLASS_TEST(n, ANYOF_NUPPER)  && !isUPPER_LC(c))  ||
6097     (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT)  &&  isXDIGIT(c))    ||
6098     (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c))    ||
6099     (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC)  &&  isPSXSPC(c))    ||
6100     (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c))    ||
6101     (ANYOF_CLASS_TEST(n, ANYOF_BLANK)   &&  isBLANK(c))     ||
6102     (ANYOF_CLASS_TEST(n, ANYOF_NBLANK)  && !isBLANK(c))
6103     ) /* How's that for a conditional? */
6104    {
6105     match = TRUE;
6106    }
6107   }
6108  }
6109
6110  return (flags & ANYOF_INVERT) ? !match : match;
6111 }
6112
6113 STATIC U8 *
6114 S_reghop3(U8 *s, I32 off, const U8* lim)
6115 {
6116  dVAR;
6117
6118  PERL_ARGS_ASSERT_REGHOP3;
6119
6120  if (off >= 0) {
6121   while (off-- && s < lim) {
6122    /* XXX could check well-formedness here */
6123    s += UTF8SKIP(s);
6124   }
6125  }
6126  else {
6127   while (off++ && s > lim) {
6128    s--;
6129    if (UTF8_IS_CONTINUED(*s)) {
6130     while (s > lim && UTF8_IS_CONTINUATION(*s))
6131      s--;
6132    }
6133    /* XXX could check well-formedness here */
6134   }
6135  }
6136  return s;
6137 }
6138
6139 #ifdef XXX_dmq
6140 /* there are a bunch of places where we use two reghop3's that should
6141    be replaced with this routine. but since thats not done yet
6142    we ifdef it out - dmq
6143 */
6144 STATIC U8 *
6145 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6146 {
6147  dVAR;
6148
6149  PERL_ARGS_ASSERT_REGHOP4;
6150
6151  if (off >= 0) {
6152   while (off-- && s < rlim) {
6153    /* XXX could check well-formedness here */
6154    s += UTF8SKIP(s);
6155   }
6156  }
6157  else {
6158   while (off++ && s > llim) {
6159    s--;
6160    if (UTF8_IS_CONTINUED(*s)) {
6161     while (s > llim && UTF8_IS_CONTINUATION(*s))
6162      s--;
6163    }
6164    /* XXX could check well-formedness here */
6165   }
6166  }
6167  return s;
6168 }
6169 #endif
6170
6171 STATIC U8 *
6172 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6173 {
6174  dVAR;
6175
6176  PERL_ARGS_ASSERT_REGHOPMAYBE3;
6177
6178  if (off >= 0) {
6179   while (off-- && s < lim) {
6180    /* XXX could check well-formedness here */
6181    s += UTF8SKIP(s);
6182   }
6183   if (off >= 0)
6184    return NULL;
6185  }
6186  else {
6187   while (off++ && s > lim) {
6188    s--;
6189    if (UTF8_IS_CONTINUED(*s)) {
6190     while (s > lim && UTF8_IS_CONTINUATION(*s))
6191      s--;
6192    }
6193    /* XXX could check well-formedness here */
6194   }
6195   if (off <= 0)
6196    return NULL;
6197  }
6198  return s;
6199 }
6200
6201 static void
6202 restore_pos(pTHX_ void *arg)
6203 {
6204  dVAR;
6205  regexp * const rex = (regexp *)arg;
6206  if (PL_reg_eval_set) {
6207   if (PL_reg_oldsaved) {
6208    rex->subbeg = PL_reg_oldsaved;
6209    rex->sublen = PL_reg_oldsavedlen;
6210 #ifdef PERL_OLD_COPY_ON_WRITE
6211    rex->saved_copy = PL_nrs;
6212 #endif
6213    RXp_MATCH_COPIED_on(rex);
6214   }
6215   PL_reg_magic->mg_len = PL_reg_oldpos;
6216   PL_reg_eval_set = 0;
6217   PL_curpm = PL_reg_oldcurpm;
6218  }
6219 }
6220
6221 STATIC void
6222 S_to_utf8_substr(pTHX_ register regexp *prog)
6223 {
6224  int i = 1;
6225
6226  PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6227
6228  do {
6229   if (prog->substrs->data[i].substr
6230    && !prog->substrs->data[i].utf8_substr) {
6231    SV* const sv = newSVsv(prog->substrs->data[i].substr);
6232    prog->substrs->data[i].utf8_substr = sv;
6233    sv_utf8_upgrade(sv);
6234    if (SvVALID(prog->substrs->data[i].substr)) {
6235     const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6236     if (flags & FBMcf_TAIL) {
6237      /* Trim the trailing \n that fbm_compile added last
6238      time.  */
6239      SvCUR_set(sv, SvCUR(sv) - 1);
6240      /* Whilst this makes the SV technically "invalid" (as its
6241      buffer is no longer followed by "\0") when fbm_compile()
6242      adds the "\n" back, a "\0" is restored.  */
6243     }
6244     fbm_compile(sv, flags);
6245    }
6246    if (prog->substrs->data[i].substr == prog->check_substr)
6247     prog->check_utf8 = sv;
6248   }
6249  } while (i--);
6250 }
6251
6252 STATIC void
6253 S_to_byte_substr(pTHX_ register regexp *prog)
6254 {
6255  dVAR;
6256  int i = 1;
6257
6258  PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6259
6260  do {
6261   if (prog->substrs->data[i].utf8_substr
6262    && !prog->substrs->data[i].substr) {
6263    SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6264    if (sv_utf8_downgrade(sv, TRUE)) {
6265     if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6266      const U8 flags
6267       = BmFLAGS(prog->substrs->data[i].utf8_substr);
6268      if (flags & FBMcf_TAIL) {
6269       /* Trim the trailing \n that fbm_compile added last
6270       time.  */
6271       SvCUR_set(sv, SvCUR(sv) - 1);
6272      }
6273      fbm_compile(sv, flags);
6274     }
6275    } else {
6276     SvREFCNT_dec(sv);
6277     sv = &PL_sv_undef;
6278    }
6279    prog->substrs->data[i].substr = sv;
6280    if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6281     prog->check_substr = sv;
6282   }
6283  } while (i--);
6284 }
6285
6286 /*
6287  * Local variables:
6288  * c-indentation-style: bsd
6289  * c-basic-offset: 4
6290  * indent-tabs-mode: t
6291  * End:
6292  *
6293  * ex: set ts=8 sts=4 sw=4 noet:
6294  */