5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 static const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
45 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
51 * pregcomp and pregexec -- regsub and regerror are not used in perl
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGEXEC_C
88 #ifdef PERL_IN_XSUB_RE
94 #include "inline_invlist.c"
95 #include "unicode_constants.h"
97 #define RF_tainted 1 /* tainted information used? e.g. locale */
98 #define RF_warned 2 /* warned about big count? */
100 #define RF_utf8 8 /* Pattern contains multibyte chars? */
102 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
104 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
107 #define STATIC static
110 /* Valid for non-utf8 strings: avoids the reginclass
111 * call if there are no complications: i.e., if everything matchable is
112 * straight forward in the bitmap */
113 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
114 : ANYOF_BITMAP_TEST(p,*(c)))
120 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
121 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
123 #define HOPc(pos,off) \
124 (char *)(PL_reg_match_utf8 \
125 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
127 #define HOPBACKc(pos, off) \
128 (char*)(PL_reg_match_utf8\
129 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
130 : (pos - off >= PL_bostr) \
134 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
135 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
138 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
139 #define NEXTCHR_IS_EOS (nextchr < 0)
141 #define SET_nextchr \
142 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
144 #define SET_locinput(p) \
149 /* these are unrolled below in the CCC_TRY_XXX defined */
150 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
151 if (!CAT2(PL_utf8_,class)) { \
153 ENTER; save_re_context(); \
154 ok=CAT2(is_utf8_,class)((const U8*)str); \
155 PERL_UNUSED_VAR(ok); \
156 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
157 /* Doesn't do an assert to verify that is correct */
158 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
159 if (!CAT2(PL_utf8_,class)) { \
161 PERL_UNUSED_VAR(throw_away); \
162 ENTER; save_re_context(); \
163 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
164 PERL_UNUSED_VAR(throw_away); \
167 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
168 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
170 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
171 /* No asserts are done for some of these, in case called on a */ \
172 /* Unicode version in which they map to nothing */ \
173 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
174 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
176 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
178 /* The actual code for CCC_TRY, which uses several variables from the routine
179 * it's callable from. It is designed to be the bulk of a case statement.
180 * FUNC is the macro or function to call on non-utf8 targets that indicate if
181 * nextchr matches the class.
182 * UTF8_TEST is the whole test string to use for utf8 targets
183 * LOAD is what to use to test, and if not present to load in the swash for the
185 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
187 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
188 * utf8 and a variant, load the swash if necessary and test using the utf8
189 * test. Advance to the next character if test is ok, otherwise fail; If not
190 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
191 * fails, or advance to the next character */
193 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
194 if (NEXTCHR_IS_EOS) { \
197 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
198 LOAD_UTF8_CHARCLASS(CLASS, STR); \
199 if (POS_OR_NEG (UTF8_TEST)) { \
203 else if (POS_OR_NEG (FUNC(nextchr))) { \
206 goto increment_locinput;
208 /* Handle the non-locale cases for a character class and its complement. It
209 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
210 * This is because that code fails when the test succeeds, so we want to have
211 * the test fail so that the code succeeds. The swash is stored in a
212 * predictable PL_ place */
213 #define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
216 _CCC_TRY_CODE( !, FUNC, \
217 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
218 (U8*)locinput, TRUE)), \
221 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
222 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
223 (U8*)locinput, TRUE)), \
225 /* Generate the case statements for both locale and non-locale character
226 * classes in regmatch for classes that don't have special unicode semantics.
227 * Locales don't use an immediate swash, but an intermediary special locale
228 * function that is called on the pointer to the current place in the input
229 * string. That function will resolve to needing the same swash. One might
230 * think that because we don't know what the locale will match, we shouldn't
231 * check with the swash loading function that it loaded properly; ie, that we
232 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
233 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
235 #define CCC_TRY(NAME, NNAME, FUNC, \
236 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
237 NAMEA, NNAMEA, FUNCA, \
240 PL_reg_flags |= RF_tainted; \
241 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
243 PL_reg_flags |= RF_tainted; \
244 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
247 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
250 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
254 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
257 goto increment_locinput; \
258 /* Generate the non-locale cases */ \
259 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
261 /* This is like CCC_TRY, but has an extra set of parameters for generating case
262 * statements to handle separate Unicode semantics nodes */
263 #define CCC_TRY_U(NAME, NNAME, FUNC, \
264 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
265 NAMEU, NNAMEU, FUNCU, \
266 NAMEA, NNAMEA, FUNCA, \
268 CCC_TRY(NAME, NNAME, FUNC, \
269 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
270 NAMEA, NNAMEA, FUNCA, \
272 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
274 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
276 /* for use after a quantifier and before an EXACT-like node -- japhy */
277 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
279 * NOTE that *nothing* that affects backtracking should be in here, specifically
280 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
281 * node that is in between two EXACT like nodes when ascertaining what the required
282 * "follow" character is. This should probably be moved to regex compile time
283 * although it may be done at run time beause of the REF possibility - more
284 * investigation required. -- demerphq
286 #define JUMPABLE(rn) ( \
288 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
290 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
291 OP(rn) == PLUS || OP(rn) == MINMOD || \
293 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
295 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
297 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
300 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
301 we don't need this definition. */
302 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
303 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
304 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
307 /* ... so we use this as its faster. */
308 #define IS_TEXT(rn) ( OP(rn)==EXACT )
309 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
310 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
311 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
316 Search for mandatory following text node; for lookahead, the text must
317 follow but for lookbehind (rn->flags != 0) we skip to the next step.
319 #define FIND_NEXT_IMPT(rn) STMT_START { \
320 while (JUMPABLE(rn)) { \
321 const OPCODE type = OP(rn); \
322 if (type == SUSPEND || PL_regkind[type] == CURLY) \
323 rn = NEXTOPER(NEXTOPER(rn)); \
324 else if (type == PLUS) \
326 else if (type == IFMATCH) \
327 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
328 else rn += NEXT_OFF(rn); \
332 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
333 * These are for the pre-composed Hangul syllables, which are all in a
334 * contiguous block and arranged there in such a way so as to facilitate
335 * alorithmic determination of their characteristics. As such, they don't need
336 * a swash, but can be determined by simple arithmetic. Almost all are
337 * GCB=LVT, but every 28th one is a GCB=LV */
338 #define SBASE 0xAC00 /* Start of block */
339 #define SCount 11172 /* Length of block */
342 static void restore_pos(pTHX_ void *arg);
344 #define REGCP_PAREN_ELEMS 3
345 #define REGCP_OTHER_ELEMS 3
346 #define REGCP_FRAME_ELEMS 1
347 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
348 * are needed for the regexp context stack bookkeeping. */
351 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
354 const int retval = PL_savestack_ix;
355 const int paren_elems_to_push =
356 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
357 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
358 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
360 GET_RE_DEBUG_FLAGS_DECL;
362 PERL_ARGS_ASSERT_REGCPPUSH;
364 if (paren_elems_to_push < 0)
365 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
366 paren_elems_to_push);
368 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
369 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
370 " out of range (%lu-%ld)",
372 (unsigned long)maxopenparen,
375 SSGROW(total_elems + REGCP_FRAME_ELEMS);
378 if ((int)maxopenparen > (int)parenfloor)
379 PerlIO_printf(Perl_debug_log,
380 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
385 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
386 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
387 SSPUSHINT(rex->offs[p].end);
388 SSPUSHINT(rex->offs[p].start);
389 SSPUSHINT(rex->offs[p].start_tmp);
390 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
391 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
393 (IV)rex->offs[p].start,
394 (IV)rex->offs[p].start_tmp,
398 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
399 SSPUSHINT(maxopenparen);
400 SSPUSHINT(rex->lastparen);
401 SSPUSHINT(rex->lastcloseparen);
402 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
407 /* These are needed since we do not localize EVAL nodes: */
408 #define REGCP_SET(cp) \
410 PerlIO_printf(Perl_debug_log, \
411 " Setting an EVAL scope, savestack=%"IVdf"\n", \
412 (IV)PL_savestack_ix)); \
415 #define REGCP_UNWIND(cp) \
417 if (cp != PL_savestack_ix) \
418 PerlIO_printf(Perl_debug_log, \
419 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
420 (IV)(cp), (IV)PL_savestack_ix)); \
423 #define UNWIND_PAREN(lp, lcp) \
424 for (n = rex->lastparen; n > lp; n--) \
425 rex->offs[n].end = -1; \
426 rex->lastparen = n; \
427 rex->lastcloseparen = lcp;
431 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
436 GET_RE_DEBUG_FLAGS_DECL;
438 PERL_ARGS_ASSERT_REGCPPOP;
440 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
442 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
443 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
444 rex->lastcloseparen = SSPOPINT;
445 rex->lastparen = SSPOPINT;
446 *maxopenparen_p = SSPOPINT;
448 i -= REGCP_OTHER_ELEMS;
449 /* Now restore the parentheses context. */
451 if (i || rex->lastparen + 1 <= rex->nparens)
452 PerlIO_printf(Perl_debug_log,
453 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
458 paren = *maxopenparen_p;
459 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
461 rex->offs[paren].start_tmp = SSPOPINT;
462 rex->offs[paren].start = SSPOPINT;
464 if (paren <= rex->lastparen)
465 rex->offs[paren].end = tmps;
466 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
467 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
469 (IV)rex->offs[paren].start,
470 (IV)rex->offs[paren].start_tmp,
471 (IV)rex->offs[paren].end,
472 (paren > rex->lastparen ? "(skipped)" : ""));
477 /* It would seem that the similar code in regtry()
478 * already takes care of this, and in fact it is in
479 * a better location to since this code can #if 0-ed out
480 * but the code in regtry() is needed or otherwise tests
481 * requiring null fields (pat.t#187 and split.t#{13,14}
482 * (as of patchlevel 7877) will fail. Then again,
483 * this code seems to be necessary or otherwise
484 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
485 * --jhi updated by dapm */
486 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
487 if (i > *maxopenparen_p)
488 rex->offs[i].start = -1;
489 rex->offs[i].end = -1;
490 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
491 " \\%"UVuf": %s ..-1 undeffing\n",
493 (i > *maxopenparen_p) ? "-1" : " "
499 /* restore the parens and associated vars at savestack position ix,
500 * but without popping the stack */
503 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
505 I32 tmpix = PL_savestack_ix;
506 PL_savestack_ix = ix;
507 regcppop(rex, maxopenparen_p);
508 PL_savestack_ix = tmpix;
511 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
514 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
516 /* Returns a boolean as to whether or not 'character' is a member of the
517 * Posix character class given by 'classnum' that should be equivalent to a
518 * value in the typedef '_char_class_number'.
520 * Ideally this could be replaced by a just an array of function pointers
521 * to the C library functions that implement the macros this calls.
522 * However, to compile, the precise function signatures are required, and
523 * these may vary from platform to to platform. To avoid having to figure
524 * out what those all are on each platform, I (khw) am using this method,
525 * which adds an extra layer of function call overhead. But we don't
526 * particularly care about performance with locales anyway. */
528 switch ((_char_class_number) classnum) {
529 case _CC_ENUM_ALNUMC: return isALNUMC_LC(character);
530 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
531 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
532 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
533 case _CC_ENUM_LOWER: return isLOWER_LC(character);
534 case _CC_ENUM_PRINT: return isPRINT_LC(character);
535 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
536 case _CC_ENUM_UPPER: return isUPPER_LC(character);
537 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
538 case _CC_ENUM_SPACE: return isSPACE_LC(character);
539 case _CC_ENUM_BLANK: return isBLANK_LC(character);
540 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
541 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
542 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
543 case _CC_ENUM_ASCII: return isASCII_LC(character);
544 default: /* VERTSPACE should never occur in locales */
545 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
548 assert(0); /* NOTREACHED */
553 * pregexec and friends
556 #ifndef PERL_IN_XSUB_RE
558 - pregexec - match a regexp against a string
561 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
562 char *strbeg, I32 minend, SV *screamer, U32 nosave)
563 /* stringarg: the point in the string at which to begin matching */
564 /* strend: pointer to null at end of string */
565 /* strbeg: real beginning of string */
566 /* minend: end of match must be >= minend bytes after stringarg. */
567 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
568 * itself is accessed via the pointers above */
569 /* nosave: For optimizations. */
571 PERL_ARGS_ASSERT_PREGEXEC;
574 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
575 nosave ? 0 : REXEC_COPY_STR);
580 * Need to implement the following flags for reg_anch:
582 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
584 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
585 * INTUIT_AUTORITATIVE_ML
586 * INTUIT_ONCE_NOML - Intuit can match in one location only.
589 * Another flag for this function: SECOND_TIME (so that float substrs
590 * with giant delta may be not rechecked).
593 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
595 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
596 Otherwise, only SvCUR(sv) is used to get strbeg. */
598 /* XXXX We assume that strpos is strbeg unless sv. */
600 /* XXXX Some places assume that there is a fixed substring.
601 An update may be needed if optimizer marks as "INTUITable"
602 RExen without fixed substrings. Similarly, it is assumed that
603 lengths of all the strings are no more than minlen, thus they
604 cannot come from lookahead.
605 (Or minlen should take into account lookahead.)
606 NOTE: Some of this comment is not correct. minlen does now take account
607 of lookahead/behind. Further research is required. -- demerphq
611 /* A failure to find a constant substring means that there is no need to make
612 an expensive call to REx engine, thus we celebrate a failure. Similarly,
613 finding a substring too deep into the string means that less calls to
614 regtry() should be needed.
616 REx compiler's optimizer found 4 possible hints:
617 a) Anchored substring;
619 c) Whether we are anchored (beginning-of-line or \G);
620 d) First node (of those at offset 0) which may distinguish positions;
621 We use a)b)d) and multiline-part of c), and try to find a position in the
622 string which does not contradict any of them.
625 /* Most of decisions we do here should have been done at compile time.
626 The nodes of the REx which we used for the search should have been
627 deleted from the finite automaton. */
630 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
631 char *strend, const U32 flags, re_scream_pos_data *data)
634 struct regexp *const prog = ReANY(rx);
636 /* Should be nonnegative! */
642 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
644 char *other_last = NULL; /* other substr checked before this */
645 char *check_at = NULL; /* check substr found at this pos */
646 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
647 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
648 RXi_GET_DECL(prog,progi);
650 const char * const i_strpos = strpos;
652 GET_RE_DEBUG_FLAGS_DECL;
654 PERL_ARGS_ASSERT_RE_INTUIT_START;
655 PERL_UNUSED_ARG(flags);
656 PERL_UNUSED_ARG(data);
658 RX_MATCH_UTF8_set(rx,utf8_target);
661 PL_reg_flags |= RF_utf8;
664 debug_start_match(rx, utf8_target, strpos, strend,
665 sv ? "Guessing start of match in sv for"
666 : "Guessing start of match in string for");
669 /* CHR_DIST() would be more correct here but it makes things slow. */
670 if (prog->minlen > strend - strpos) {
671 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
672 "String too short... [re_intuit_start]\n"));
676 /* XXX we need to pass strbeg as a separate arg: the following is
677 * guesswork and can be wrong... */
678 if (sv && SvPOK(sv)) {
679 char * p = SvPVX(sv);
680 STRLEN cur = SvCUR(sv);
681 if (p <= strpos && strpos < p + cur) {
683 assert(p <= strend && strend <= p + cur);
686 strbeg = strend - cur;
693 if (!prog->check_utf8 && prog->check_substr)
694 to_utf8_substr(prog);
695 check = prog->check_utf8;
697 if (!prog->check_substr && prog->check_utf8) {
698 if (! to_byte_substr(prog)) {
699 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
702 check = prog->check_substr;
704 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
705 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
706 || ( (prog->extflags & RXf_ANCH_BOL)
707 && !multiline ) ); /* Check after \n? */
710 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
711 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
712 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
714 && (strpos != strbeg)) {
715 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
718 if (prog->check_offset_min == prog->check_offset_max
719 && !(prog->extflags & RXf_CANY_SEEN)
720 && ! multiline) /* /m can cause \n's to match that aren't
721 accounted for in the string max length.
722 See [perl #115242] */
724 /* Substring at constant offset from beg-of-str... */
727 s = HOP3c(strpos, prog->check_offset_min, strend);
730 slen = SvCUR(check); /* >= 1 */
732 if ( strend - s > slen || strend - s < slen - 1
733 || (strend - s == slen && strend[-1] != '\n')) {
734 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
737 /* Now should match s[0..slen-2] */
739 if (slen && (*SvPVX_const(check) != *s
741 && memNE(SvPVX_const(check), s, slen)))) {
743 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
747 else if (*SvPVX_const(check) != *s
748 || ((slen = SvCUR(check)) > 1
749 && memNE(SvPVX_const(check), s, slen)))
752 goto success_at_start;
755 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
757 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
758 end_shift = prog->check_end_shift;
761 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
762 - (SvTAIL(check) != 0);
763 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
765 if (end_shift < eshift)
769 else { /* Can match at random position */
772 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
773 end_shift = prog->check_end_shift;
775 /* end shift should be non negative here */
778 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
780 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
781 (IV)end_shift, RX_PRECOMP(prog));
785 /* Find a possible match in the region s..strend by looking for
786 the "check" substring in the region corrected by start/end_shift. */
789 I32 srch_start_shift = start_shift;
790 I32 srch_end_shift = end_shift;
793 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
794 srch_end_shift -= ((strbeg - s) - srch_start_shift);
795 srch_start_shift = strbeg - s;
797 DEBUG_OPTIMISE_MORE_r({
798 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
799 (IV)prog->check_offset_min,
800 (IV)srch_start_shift,
802 (IV)prog->check_end_shift);
805 if (prog->extflags & RXf_CANY_SEEN) {
806 start_point= (U8*)(s + srch_start_shift);
807 end_point= (U8*)(strend - srch_end_shift);
809 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
810 end_point= HOP3(strend, -srch_end_shift, strbeg);
812 DEBUG_OPTIMISE_MORE_r({
813 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
814 (int)(end_point - start_point),
815 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
819 s = fbm_instr( start_point, end_point,
820 check, multiline ? FBMrf_MULTILINE : 0);
822 /* Update the count-of-usability, remove useless subpatterns,
826 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
827 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
828 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
829 (s ? "Found" : "Did not find"),
830 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
831 ? "anchored" : "floating"),
834 (s ? " at offset " : "...\n") );
839 /* Finish the diagnostic message */
840 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
842 /* XXX dmq: first branch is for positive lookbehind...
843 Our check string is offset from the beginning of the pattern.
844 So we need to do any stclass tests offset forward from that
853 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
854 Start with the other substr.
855 XXXX no SCREAM optimization yet - and a very coarse implementation
856 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
857 *always* match. Probably should be marked during compile...
858 Probably it is right to do no SCREAM here...
861 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
862 : (prog->float_substr && prog->anchored_substr))
864 /* Take into account the "other" substring. */
865 /* XXXX May be hopelessly wrong for UTF... */
868 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
871 char * const last = HOP3c(s, -start_shift, strbeg);
873 char * const saved_s = s;
876 t = s - prog->check_offset_max;
877 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
879 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
884 t = HOP3c(t, prog->anchored_offset, strend);
885 if (t < other_last) /* These positions already checked */
887 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
890 /* XXXX It is not documented what units *_offsets are in.
891 We assume bytes, but this is clearly wrong.
892 Meaning this code needs to be carefully reviewed for errors.
896 /* On end-of-str: see comment below. */
897 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
898 if (must == &PL_sv_undef) {
900 DEBUG_r(must = prog->anchored_utf8); /* for debug */
905 HOP3(HOP3(last1, prog->anchored_offset, strend)
906 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
908 multiline ? FBMrf_MULTILINE : 0
911 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
912 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
913 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
914 (s ? "Found" : "Contradicts"),
915 quoted, RE_SV_TAIL(must));
920 if (last1 >= last2) {
921 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
922 ", giving up...\n"));
925 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
926 ", trying floating at offset %ld...\n",
927 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
928 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
929 s = HOP3c(last, 1, strend);
933 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
934 (long)(s - i_strpos)));
935 t = HOP3c(s, -prog->anchored_offset, strbeg);
936 other_last = HOP3c(s, 1, strend);
944 else { /* Take into account the floating substring. */
946 char * const saved_s = s;
949 t = HOP3c(s, -start_shift, strbeg);
951 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
952 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
953 last = HOP3c(t, prog->float_max_offset, strend);
954 s = HOP3c(t, prog->float_min_offset, strend);
957 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
958 must = utf8_target ? prog->float_utf8 : prog->float_substr;
959 /* fbm_instr() takes into account exact value of end-of-str
960 if the check is SvTAIL(ed). Since false positives are OK,
961 and end-of-str is not later than strend we are OK. */
962 if (must == &PL_sv_undef) {
964 DEBUG_r(must = prog->float_utf8); /* for debug message */
967 s = fbm_instr((unsigned char*)s,
968 (unsigned char*)last + SvCUR(must)
970 must, multiline ? FBMrf_MULTILINE : 0);
972 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
973 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
974 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
975 (s ? "Found" : "Contradicts"),
976 quoted, RE_SV_TAIL(must));
980 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
981 ", giving up...\n"));
984 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
985 ", trying anchored starting at offset %ld...\n",
986 (long)(saved_s + 1 - i_strpos)));
988 s = HOP3c(t, 1, strend);
992 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
993 (long)(s - i_strpos)));
994 other_last = s; /* Fix this later. --Hugo */
1004 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1006 DEBUG_OPTIMISE_MORE_r(
1007 PerlIO_printf(Perl_debug_log,
1008 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
1009 (IV)prog->check_offset_min,
1010 (IV)prog->check_offset_max,
1018 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1020 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1023 /* Fixed substring is found far enough so that the match
1024 cannot start at strpos. */
1026 if (ml_anch && t[-1] != '\n') {
1027 /* Eventually fbm_*() should handle this, but often
1028 anchored_offset is not 0, so this check will not be wasted. */
1029 /* XXXX In the code below we prefer to look for "^" even in
1030 presence of anchored substrings. And we search even
1031 beyond the found float position. These pessimizations
1032 are historical artefacts only. */
1034 while (t < strend - prog->minlen) {
1036 if (t < check_at - prog->check_offset_min) {
1037 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1038 /* Since we moved from the found position,
1039 we definitely contradict the found anchored
1040 substr. Due to the above check we do not
1041 contradict "check" substr.
1042 Thus we can arrive here only if check substr
1043 is float. Redo checking for "other"=="fixed".
1046 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1047 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1048 goto do_other_anchored;
1050 /* We don't contradict the found floating substring. */
1051 /* XXXX Why not check for STCLASS? */
1053 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1054 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1057 /* Position contradicts check-string */
1058 /* XXXX probably better to look for check-string
1059 than for "\n", so one should lower the limit for t? */
1060 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1061 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1062 other_last = strpos = s = t + 1;
1067 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1068 PL_colors[0], PL_colors[1]));
1072 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1073 PL_colors[0], PL_colors[1]));
1077 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1080 /* The found string does not prohibit matching at strpos,
1081 - no optimization of calling REx engine can be performed,
1082 unless it was an MBOL and we are not after MBOL,
1083 or a future STCLASS check will fail this. */
1085 /* Even in this situation we may use MBOL flag if strpos is offset
1086 wrt the start of the string. */
1087 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1088 && (strpos != strbeg) && strpos[-1] != '\n'
1089 /* May be due to an implicit anchor of m{.*foo} */
1090 && !(prog->intflags & PREGf_IMPLICIT))
1095 DEBUG_EXECUTE_r( if (ml_anch)
1096 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1097 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1100 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1102 prog->check_utf8 /* Could be deleted already */
1103 && --BmUSEFUL(prog->check_utf8) < 0
1104 && (prog->check_utf8 == prog->float_utf8)
1106 prog->check_substr /* Could be deleted already */
1107 && --BmUSEFUL(prog->check_substr) < 0
1108 && (prog->check_substr == prog->float_substr)
1111 /* If flags & SOMETHING - do not do it many times on the same match */
1112 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1113 /* XXX Does the destruction order has to change with utf8_target? */
1114 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1115 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1116 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1117 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1118 check = NULL; /* abort */
1120 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1121 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1122 if (prog->intflags & PREGf_IMPLICIT)
1123 prog->extflags &= ~RXf_ANCH_MBOL;
1124 /* XXXX This is a remnant of the old implementation. It
1125 looks wasteful, since now INTUIT can use many
1126 other heuristics. */
1127 prog->extflags &= ~RXf_USE_INTUIT;
1128 /* XXXX What other flags might need to be cleared in this branch? */
1134 /* Last resort... */
1135 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1136 /* trie stclasses are too expensive to use here, we are better off to
1137 leave it to regmatch itself */
1138 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1139 /* minlen == 0 is possible if regstclass is \b or \B,
1140 and the fixed substr is ''$.
1141 Since minlen is already taken into account, s+1 is before strend;
1142 accidentally, minlen >= 1 guaranties no false positives at s + 1
1143 even for \b or \B. But (minlen? 1 : 0) below assumes that
1144 regstclass does not come from lookahead... */
1145 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1146 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1147 const U8* const str = (U8*)STRING(progi->regstclass);
1148 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1149 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1152 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1153 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1154 else if (prog->float_substr || prog->float_utf8)
1155 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1159 if (checked_upto < s)
1161 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1162 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1165 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1170 const char *what = NULL;
1172 if (endpos == strend) {
1173 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1174 "Could not match STCLASS...\n") );
1177 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1178 "This position contradicts STCLASS...\n") );
1179 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1181 checked_upto = HOPBACKc(endpos, start_shift);
1182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1183 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1184 /* Contradict one of substrings */
1185 if (prog->anchored_substr || prog->anchored_utf8) {
1186 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1187 DEBUG_EXECUTE_r( what = "anchored" );
1189 s = HOP3c(t, 1, strend);
1190 if (s + start_shift + end_shift > strend) {
1191 /* XXXX Should be taken into account earlier? */
1192 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1193 "Could not match STCLASS...\n") );
1198 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1199 "Looking for %s substr starting at offset %ld...\n",
1200 what, (long)(s + start_shift - i_strpos)) );
1203 /* Have both, check_string is floating */
1204 if (t + start_shift >= check_at) /* Contradicts floating=check */
1205 goto retry_floating_check;
1206 /* Recheck anchored substring, but not floating... */
1210 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1211 "Looking for anchored substr starting at offset %ld...\n",
1212 (long)(other_last - i_strpos)) );
1213 goto do_other_anchored;
1215 /* Another way we could have checked stclass at the
1216 current position only: */
1221 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1222 "Looking for /%s^%s/m starting at offset %ld...\n",
1223 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1226 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1228 /* Check is floating substring. */
1229 retry_floating_check:
1230 t = check_at - start_shift;
1231 DEBUG_EXECUTE_r( what = "floating" );
1232 goto hop_and_restart;
1235 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1236 "By STCLASS: moving %ld --> %ld\n",
1237 (long)(t - i_strpos), (long)(s - i_strpos))
1241 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1242 "Does not contradict STCLASS...\n");
1247 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1248 PL_colors[4], (check ? "Guessed" : "Giving up"),
1249 PL_colors[5], (long)(s - i_strpos)) );
1252 fail_finish: /* Substring not found */
1253 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1254 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1256 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1257 PL_colors[4], PL_colors[5]));
1261 #define DECL_TRIE_TYPE(scan) \
1262 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1263 trie_type = ((scan->flags == EXACT) \
1264 ? (utf8_target ? trie_utf8 : trie_plain) \
1265 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1267 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1268 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1270 switch (trie_type) { \
1271 case trie_utf8_fold: \
1272 if ( foldlen>0 ) { \
1273 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1278 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1279 len = UTF8SKIP(uc); \
1280 skiplen = UNISKIP( uvc ); \
1281 foldlen -= skiplen; \
1282 uscan = foldbuf + skiplen; \
1285 case trie_latin_utf8_fold: \
1286 if ( foldlen>0 ) { \
1287 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1293 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1294 skiplen = UNISKIP( uvc ); \
1295 foldlen -= skiplen; \
1296 uscan = foldbuf + skiplen; \
1300 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1307 charid = trie->charmap[ uvc ]; \
1311 if (widecharmap) { \
1312 SV** const svpp = hv_fetch(widecharmap, \
1313 (char*)&uvc, sizeof(UV), 0); \
1315 charid = (U16)SvIV(*svpp); \
1320 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1324 && (ln == 1 || folder(s, pat_string, ln)) \
1325 && (!reginfo || regtry(reginfo, &s)) ) \
1331 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1333 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
1339 #define REXEC_FBC_SCAN(CoDe) \
1341 while (s < strend) { \
1347 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1348 REXEC_FBC_UTF8_SCAN( \
1350 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1359 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1362 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1371 #define REXEC_FBC_TRYIT \
1372 if ((!reginfo || regtry(reginfo, &s))) \
1375 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1376 if (utf8_target) { \
1377 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1380 REXEC_FBC_CLASS_SCAN(CoNd); \
1383 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1384 if (utf8_target) { \
1386 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1389 REXEC_FBC_CLASS_SCAN(CoNd); \
1392 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1393 PL_reg_flags |= RF_tainted; \
1394 if (utf8_target) { \
1395 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1398 REXEC_FBC_CLASS_SCAN(CoNd); \
1401 #define DUMP_EXEC_POS(li,s,doutf8) \
1402 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1405 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1406 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1407 tmp = TEST_NON_UTF8(tmp); \
1408 REXEC_FBC_UTF8_SCAN( \
1409 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1418 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1419 if (s == PL_bostr) { \
1423 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1424 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1427 LOAD_UTF8_CHARCLASS_ALNUM(); \
1428 REXEC_FBC_UTF8_SCAN( \
1429 if (tmp == ! (TeSt2_UtF8)) { \
1438 /* The only difference between the BOUND and NBOUND cases is that
1439 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1440 * NBOUND. This is accomplished by passing it in either the if or else clause,
1441 * with the other one being empty */
1442 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1443 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1445 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1446 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1448 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1449 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1451 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1452 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1455 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1456 * be passed in completely with the variable name being tested, which isn't
1457 * such a clean interface, but this is easier to read than it was before. We
1458 * are looking for the boundary (or non-boundary between a word and non-word
1459 * character. The utf8 and non-utf8 cases have the same logic, but the details
1460 * must be different. Find the "wordness" of the character just prior to this
1461 * one, and compare it with the wordness of this one. If they differ, we have
1462 * a boundary. At the beginning of the string, pretend that the previous
1463 * character was a new-line */
1464 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1465 if (utf8_target) { \
1468 else { /* Not utf8 */ \
1469 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1470 tmp = TEST_NON_UTF8(tmp); \
1472 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1481 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1484 /* We know what class REx starts with. Try to find this position... */
1485 /* if reginfo is NULL, its a dryrun */
1486 /* annoyingly all the vars in this routine have different names from their counterparts
1487 in regmatch. /grrr */
1490 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1491 const char *strend, regmatch_info *reginfo)
1494 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1495 char *pat_string; /* The pattern's exactish string */
1496 char *pat_end; /* ptr to end char of pat_string */
1497 re_fold_t folder; /* Function for computing non-utf8 folds */
1498 const U8 *fold_array; /* array for folding ords < 256 */
1505 I32 tmp = 1; /* Scratch variable? */
1506 const bool utf8_target = PL_reg_match_utf8;
1507 UV utf8_fold_flags = 0;
1508 RXi_GET_DECL(prog,progi);
1510 PERL_ARGS_ASSERT_FIND_BYCLASS;
1512 /* We know what class it must start with. */
1516 REXEC_FBC_UTF8_CLASS_SCAN(
1517 reginclass(prog, c, (U8*)s, utf8_target));
1520 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1525 if (tmp && (!reginfo || regtry(reginfo, &s)))
1533 if (UTF_PATTERN || utf8_target) {
1534 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1535 goto do_exactf_utf8;
1537 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1538 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1539 goto do_exactf_non_utf8; /* isn't dealt with by these */
1544 /* regcomp.c already folded this if pattern is in UTF-8 */
1545 utf8_fold_flags = 0;
1546 goto do_exactf_utf8;
1548 fold_array = PL_fold;
1550 goto do_exactf_non_utf8;
1553 if (UTF_PATTERN || utf8_target) {
1554 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1555 goto do_exactf_utf8;
1557 fold_array = PL_fold_locale;
1558 folder = foldEQ_locale;
1559 goto do_exactf_non_utf8;
1563 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1565 goto do_exactf_utf8;
1567 case EXACTFU_TRICKYFOLD:
1569 if (UTF_PATTERN || utf8_target) {
1570 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1571 goto do_exactf_utf8;
1574 /* Any 'ss' in the pattern should have been replaced by regcomp,
1575 * so we don't have to worry here about this single special case
1576 * in the Latin1 range */
1577 fold_array = PL_fold_latin1;
1578 folder = foldEQ_latin1;
1582 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1583 are no glitches with fold-length differences
1584 between the target string and pattern */
1586 /* The idea in the non-utf8 EXACTF* cases is to first find the
1587 * first character of the EXACTF* node and then, if necessary,
1588 * case-insensitively compare the full text of the node. c1 is the
1589 * first character. c2 is its fold. This logic will not work for
1590 * Unicode semantics and the german sharp ss, which hence should
1591 * not be compiled into a node that gets here. */
1592 pat_string = STRING(c);
1593 ln = STR_LEN(c); /* length to match in octets/bytes */
1595 /* We know that we have to match at least 'ln' bytes (which is the
1596 * same as characters, since not utf8). If we have to match 3
1597 * characters, and there are only 2 availabe, we know without
1598 * trying that it will fail; so don't start a match past the
1599 * required minimum number from the far end */
1600 e = HOP3c(strend, -((I32)ln), s);
1602 if (!reginfo && e < s) {
1603 e = s; /* Due to minlen logic of intuit() */
1607 c2 = fold_array[c1];
1608 if (c1 == c2) { /* If char and fold are the same */
1609 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1612 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1620 /* If one of the operands is in utf8, we can't use the simpler folding
1621 * above, due to the fact that many different characters can have the
1622 * same fold, or portion of a fold, or different- length fold */
1623 pat_string = STRING(c);
1624 ln = STR_LEN(c); /* length to match in octets/bytes */
1625 pat_end = pat_string + ln;
1626 lnc = (UTF_PATTERN) /* length to match in characters */
1627 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1630 /* We have 'lnc' characters to match in the pattern, but because of
1631 * multi-character folding, each character in the target can match
1632 * up to 3 characters (Unicode guarantees it will never exceed
1633 * this) if it is utf8-encoded; and up to 2 if not (based on the
1634 * fact that the Latin 1 folds are already determined, and the
1635 * only multi-char fold in that range is the sharp-s folding to
1636 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1637 * string character. Adjust lnc accordingly, rounding up, so that
1638 * if we need to match at least 4+1/3 chars, that really is 5. */
1639 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1640 lnc = (lnc + expansion - 1) / expansion;
1642 /* As in the non-UTF8 case, if we have to match 3 characters, and
1643 * only 2 are left, it's guaranteed to fail, so don't start a
1644 * match that would require us to go beyond the end of the string
1646 e = HOP3c(strend, -((I32)lnc), s);
1648 if (!reginfo && e < s) {
1649 e = s; /* Due to minlen logic of intuit() */
1652 /* XXX Note that we could recalculate e to stop the loop earlier,
1653 * as the worst case expansion above will rarely be met, and as we
1654 * go along we would usually find that e moves further to the left.
1655 * This would happen only after we reached the point in the loop
1656 * where if there were no expansion we should fail. Unclear if
1657 * worth the expense */
1660 char *my_strend= (char *)strend;
1661 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1662 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1663 && (!reginfo || regtry(reginfo, &s)) )
1667 s += (utf8_target) ? UTF8SKIP(s) : 1;
1672 PL_reg_flags |= RF_tainted;
1673 FBC_BOUND(isALNUM_LC,
1674 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1675 isALNUM_LC_utf8((U8*)s));
1678 PL_reg_flags |= RF_tainted;
1679 FBC_NBOUND(isALNUM_LC,
1680 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1681 isALNUM_LC_utf8((U8*)s));
1684 FBC_BOUND(isWORDCHAR,
1686 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1689 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1691 isWORDCHAR_A((U8*)s));
1694 FBC_NBOUND(isWORDCHAR,
1696 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1699 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1701 isWORDCHAR_A((U8*)s));
1704 FBC_BOUND(isWORDCHAR_L1,
1706 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1709 FBC_NBOUND(isWORDCHAR_L1,
1711 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1714 REXEC_FBC_CSCAN_TAINT(
1715 isALNUM_LC_utf8((U8*)s),
1720 REXEC_FBC_CSCAN_PRELOAD(
1721 LOAD_UTF8_CHARCLASS_ALNUM(),
1722 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1723 isWORDCHAR_L1((U8) *s)
1727 REXEC_FBC_CSCAN_PRELOAD(
1728 LOAD_UTF8_CHARCLASS_ALNUM(),
1729 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1734 /* Don't need to worry about utf8, as it can match only a single
1735 * byte invariant character */
1736 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1739 REXEC_FBC_CSCAN_PRELOAD(
1740 LOAD_UTF8_CHARCLASS_ALNUM(),
1741 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1742 ! isWORDCHAR_L1((U8) *s)
1746 REXEC_FBC_CSCAN_PRELOAD(
1747 LOAD_UTF8_CHARCLASS_ALNUM(),
1748 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1759 REXEC_FBC_CSCAN_TAINT(
1760 !isALNUM_LC_utf8((U8*)s),
1766 is_XPERLSPACE_utf8(s),
1772 is_XPERLSPACE_utf8(s),
1777 /* Don't need to worry about utf8, as it can match only a single
1778 * byte invariant character */
1779 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1782 REXEC_FBC_CSCAN_TAINT(
1783 isSPACE_LC_utf8((U8*)s),
1789 ! is_XPERLSPACE_utf8(s),
1790 ! isSPACE_L1((U8) *s)
1795 ! is_XPERLSPACE_utf8(s),
1806 REXEC_FBC_CSCAN_TAINT(
1807 !isSPACE_LC_utf8((U8*)s),
1812 REXEC_FBC_CSCAN_PRELOAD(
1813 LOAD_UTF8_CHARCLASS_DIGIT(),
1814 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1819 /* Don't need to worry about utf8, as it can match only a single
1820 * byte invariant character */
1821 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1824 REXEC_FBC_CSCAN_TAINT(
1825 isDIGIT_LC_utf8((U8*)s),
1830 REXEC_FBC_CSCAN_PRELOAD(
1831 LOAD_UTF8_CHARCLASS_DIGIT(),
1832 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1843 REXEC_FBC_CSCAN_TAINT(
1844 !isDIGIT_LC_utf8((U8*)s),
1849 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1850 is_LNBREAK_latin1_safe(s, strend)
1855 is_VERTWS_utf8_safe(s, strend),
1856 is_VERTWS_latin1_safe(s, strend)
1861 !is_VERTWS_utf8_safe(s, strend),
1862 !is_VERTWS_latin1_safe(s, strend)
1867 is_HORIZWS_utf8_safe(s, strend),
1868 is_HORIZWS_latin1_safe(s, strend)
1873 !is_HORIZWS_utf8_safe(s, strend),
1874 !is_HORIZWS_latin1_safe(s, strend)
1878 /* Don't need to worry about utf8, as it can match only a single
1879 * byte invariant character. The flag in this node type is the
1880 * class number to pass to _generic_isCC() to build a mask for
1881 * searching in PL_charclass[] */
1882 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1886 !_generic_isCC_A(*s, FLAGS(c)),
1887 !_generic_isCC_A(*s, FLAGS(c))
1895 /* what trie are we using right now */
1896 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1897 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1898 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1900 const char *last_start = strend - trie->minlen;
1902 const char *real_start = s;
1904 STRLEN maxlen = trie->maxlen;
1906 U8 **points; /* map of where we were in the input string
1907 when reading a given char. For ASCII this
1908 is unnecessary overhead as the relationship
1909 is always 1:1, but for Unicode, especially
1910 case folded Unicode this is not true. */
1911 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1915 GET_RE_DEBUG_FLAGS_DECL;
1917 /* We can't just allocate points here. We need to wrap it in
1918 * an SV so it gets freed properly if there is a croak while
1919 * running the match */
1922 sv_points=newSV(maxlen * sizeof(U8 *));
1923 SvCUR_set(sv_points,
1924 maxlen * sizeof(U8 *));
1925 SvPOK_on(sv_points);
1926 sv_2mortal(sv_points);
1927 points=(U8**)SvPV_nolen(sv_points );
1928 if ( trie_type != trie_utf8_fold
1929 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1932 bitmap=(U8*)trie->bitmap;
1934 bitmap=(U8*)ANYOF_BITMAP(c);
1936 /* this is the Aho-Corasick algorithm modified a touch
1937 to include special handling for long "unknown char" sequences.
1938 The basic idea being that we use AC as long as we are dealing
1939 with a possible matching char, when we encounter an unknown char
1940 (and we have not encountered an accepting state) we scan forward
1941 until we find a legal starting char.
1942 AC matching is basically that of trie matching, except that when
1943 we encounter a failing transition, we fall back to the current
1944 states "fail state", and try the current char again, a process
1945 we repeat until we reach the root state, state 1, or a legal
1946 transition. If we fail on the root state then we can either
1947 terminate if we have reached an accepting state previously, or
1948 restart the entire process from the beginning if we have not.
1951 while (s <= last_start) {
1952 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1960 U8 *uscan = (U8*)NULL;
1961 U8 *leftmost = NULL;
1963 U32 accepted_word= 0;
1967 while ( state && uc <= (U8*)strend ) {
1969 U32 word = aho->states[ state ].wordnum;
1973 DEBUG_TRIE_EXECUTE_r(
1974 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1975 dump_exec_pos( (char *)uc, c, strend, real_start,
1976 (char *)uc, utf8_target );
1977 PerlIO_printf( Perl_debug_log,
1978 " Scanning for legal start char...\n");
1982 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1986 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1992 if (uc >(U8*)last_start) break;
1996 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1997 if (!leftmost || lpos < leftmost) {
1998 DEBUG_r(accepted_word=word);
2004 points[pointpos++ % maxlen]= uc;
2005 if (foldlen || uc < (U8*)strend) {
2006 REXEC_TRIE_READ_CHAR(trie_type, trie,
2008 uscan, len, uvc, charid, foldlen,
2010 DEBUG_TRIE_EXECUTE_r({
2011 dump_exec_pos( (char *)uc, c, strend,
2012 real_start, s, utf8_target);
2013 PerlIO_printf(Perl_debug_log,
2014 " Charid:%3u CP:%4"UVxf" ",
2026 word = aho->states[ state ].wordnum;
2028 base = aho->states[ state ].trans.base;
2030 DEBUG_TRIE_EXECUTE_r({
2032 dump_exec_pos( (char *)uc, c, strend, real_start,
2034 PerlIO_printf( Perl_debug_log,
2035 "%sState: %4"UVxf", word=%"UVxf,
2036 failed ? " Fail transition to " : "",
2037 (UV)state, (UV)word);
2043 ( ((offset = base + charid
2044 - 1 - trie->uniquecharcount)) >= 0)
2045 && ((U32)offset < trie->lasttrans)
2046 && trie->trans[offset].check == state
2047 && (tmp=trie->trans[offset].next))
2049 DEBUG_TRIE_EXECUTE_r(
2050 PerlIO_printf( Perl_debug_log," - legal\n"));
2055 DEBUG_TRIE_EXECUTE_r(
2056 PerlIO_printf( Perl_debug_log," - fail\n"));
2058 state = aho->fail[state];
2062 /* we must be accepting here */
2063 DEBUG_TRIE_EXECUTE_r(
2064 PerlIO_printf( Perl_debug_log," - accepting\n"));
2073 if (!state) state = 1;
2076 if ( aho->states[ state ].wordnum ) {
2077 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2078 if (!leftmost || lpos < leftmost) {
2079 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2084 s = (char*)leftmost;
2085 DEBUG_TRIE_EXECUTE_r({
2087 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2088 (UV)accepted_word, (IV)(s - real_start)
2091 if (!reginfo || regtry(reginfo, &s)) {
2097 DEBUG_TRIE_EXECUTE_r({
2098 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2101 DEBUG_TRIE_EXECUTE_r(
2102 PerlIO_printf( Perl_debug_log,"No match.\n"));
2111 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2121 - regexec_flags - match a regexp against a string
2124 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2125 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2126 /* stringarg: the point in the string at which to begin matching */
2127 /* strend: pointer to null at end of string */
2128 /* strbeg: real beginning of string */
2129 /* minend: end of match must be >= minend bytes after stringarg. */
2130 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2131 * itself is accessed via the pointers above */
2132 /* data: May be used for some additional optimizations.
2133 Currently its only used, with a U32 cast, for transmitting
2134 the ganch offset when doing a /g match. This will change */
2135 /* nosave: For optimizations. */
2139 struct regexp *const prog = ReANY(rx);
2142 char *startpos = stringarg;
2143 I32 minlen; /* must match at least this many chars */
2144 I32 dontbother = 0; /* how many characters not to try at end */
2145 I32 end_shift = 0; /* Same for the end. */ /* CC */
2146 I32 scream_pos = -1; /* Internal iterator of scream. */
2147 char *scream_olds = NULL;
2148 const bool utf8_target = cBOOL(DO_UTF8(sv));
2150 RXi_GET_DECL(prog,progi);
2151 regmatch_info reginfo; /* create some info to pass to regtry etc */
2152 regexp_paren_pair *swap = NULL;
2153 GET_RE_DEBUG_FLAGS_DECL;
2155 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2156 PERL_UNUSED_ARG(data);
2158 /* Be paranoid... */
2159 if (prog == NULL || startpos == NULL) {
2160 Perl_croak(aTHX_ "NULL regexp parameter");
2164 multiline = prog->extflags & RXf_PMf_MULTILINE;
2165 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2167 RX_MATCH_UTF8_set(rx, utf8_target);
2169 debug_start_match(rx, utf8_target, startpos, strend,
2173 minlen = prog->minlen;
2175 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2177 "String too short [regexec_flags]...\n"));
2182 /* Check validity of program. */
2183 if (UCHARAT(progi->program) != REG_MAGIC) {
2184 Perl_croak(aTHX_ "corrupted regexp program");
2188 PL_reg_state.re_state_eval_setup_done = FALSE;
2192 PL_reg_flags |= RF_utf8;
2194 /* Mark beginning of line for ^ and lookbehind. */
2195 reginfo.bol = startpos; /* XXX not used ??? */
2199 /* Mark end of line for $ (and such) */
2202 /* see how far we have to get to not match where we matched before */
2203 reginfo.till = startpos+minend;
2205 /* If there is a "must appear" string, look for it. */
2208 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2210 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2211 reginfo.ganch = startpos + prog->gofs;
2212 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2213 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2214 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2216 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2217 && mg->mg_len >= 0) {
2218 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2219 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2220 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2222 if (prog->extflags & RXf_ANCH_GPOS) {
2223 if (s > reginfo.ganch)
2225 s = reginfo.ganch - prog->gofs;
2226 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2227 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2233 reginfo.ganch = strbeg + PTR2UV(data);
2234 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2235 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2237 } else { /* pos() not defined */
2238 reginfo.ganch = strbeg;
2239 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2240 "GPOS: reginfo.ganch = strbeg\n"));
2243 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2244 /* We have to be careful. If the previous successful match
2245 was from this regex we don't want a subsequent partially
2246 successful match to clobber the old results.
2247 So when we detect this possibility we add a swap buffer
2248 to the re, and switch the buffer each match. If we fail
2249 we switch it back, otherwise we leave it swapped.
2252 /* do we need a save destructor here for eval dies? */
2253 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2254 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2255 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2261 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2262 re_scream_pos_data d;
2264 d.scream_olds = &scream_olds;
2265 d.scream_pos = &scream_pos;
2266 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2268 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2269 goto phooey; /* not present */
2275 /* Simplest case: anchored match need be tried only once. */
2276 /* [unless only anchor is BOL and multiline is set] */
2277 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2278 if (s == startpos && regtry(®info, &startpos))
2280 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2281 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2286 dontbother = minlen - 1;
2287 end = HOP3c(strend, -dontbother, strbeg) - 1;
2288 /* for multiline we only have to try after newlines */
2289 if (prog->check_substr || prog->check_utf8) {
2290 /* because of the goto we can not easily reuse the macros for bifurcating the
2291 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2294 goto after_try_utf8;
2296 if (regtry(®info, &s)) {
2303 if (prog->extflags & RXf_USE_INTUIT) {
2304 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2313 } /* end search for check string in unicode */
2315 if (s == startpos) {
2316 goto after_try_latin;
2319 if (regtry(®info, &s)) {
2326 if (prog->extflags & RXf_USE_INTUIT) {
2327 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2336 } /* end search for check string in latin*/
2337 } /* end search for check string */
2338 else { /* search for newline */
2340 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2343 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2344 while (s <= end) { /* note it could be possible to match at the end of the string */
2345 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2346 if (regtry(®info, &s))
2350 } /* end search for newline */
2351 } /* end anchored/multiline check string search */
2353 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2355 /* the warning about reginfo.ganch being used without initialization
2356 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2357 and we only enter this block when the same bit is set. */
2358 char *tmp_s = reginfo.ganch - prog->gofs;
2360 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2365 /* Messy cases: unanchored match. */
2366 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2367 /* we have /x+whatever/ */
2368 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2374 if (! prog->anchored_utf8) {
2375 to_utf8_substr(prog);
2377 ch = SvPVX_const(prog->anchored_utf8)[0];
2380 DEBUG_EXECUTE_r( did_match = 1 );
2381 if (regtry(®info, &s)) goto got_it;
2383 while (s < strend && *s == ch)
2390 if (! prog->anchored_substr) {
2391 if (! to_byte_substr(prog)) {
2392 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2395 ch = SvPVX_const(prog->anchored_substr)[0];
2398 DEBUG_EXECUTE_r( did_match = 1 );
2399 if (regtry(®info, &s)) goto got_it;
2401 while (s < strend && *s == ch)
2406 DEBUG_EXECUTE_r(if (!did_match)
2407 PerlIO_printf(Perl_debug_log,
2408 "Did not find anchored character...\n")
2411 else if (prog->anchored_substr != NULL
2412 || prog->anchored_utf8 != NULL
2413 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2414 && prog->float_max_offset < strend - s)) {
2419 char *last1; /* Last position checked before */
2423 if (prog->anchored_substr || prog->anchored_utf8) {
2425 if (! prog->anchored_utf8) {
2426 to_utf8_substr(prog);
2428 must = prog->anchored_utf8;
2431 if (! prog->anchored_substr) {
2432 if (! to_byte_substr(prog)) {
2433 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2436 must = prog->anchored_substr;
2438 back_max = back_min = prog->anchored_offset;
2441 if (! prog->float_utf8) {
2442 to_utf8_substr(prog);
2444 must = prog->float_utf8;
2447 if (! prog->float_substr) {
2448 if (! to_byte_substr(prog)) {
2449 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2452 must = prog->float_substr;
2454 back_max = prog->float_max_offset;
2455 back_min = prog->float_min_offset;
2461 last = HOP3c(strend, /* Cannot start after this */
2462 -(I32)(CHR_SVLEN(must)
2463 - (SvTAIL(must) != 0) + back_min), strbeg);
2466 last1 = HOPc(s, -1);
2468 last1 = s - 1; /* bogus */
2470 /* XXXX check_substr already used to find "s", can optimize if
2471 check_substr==must. */
2473 dontbother = end_shift;
2474 strend = HOPc(strend, -dontbother);
2475 while ( (s <= last) &&
2476 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2477 (unsigned char*)strend, must,
2478 multiline ? FBMrf_MULTILINE : 0)) ) {
2479 DEBUG_EXECUTE_r( did_match = 1 );
2480 if (HOPc(s, -back_max) > last1) {
2481 last1 = HOPc(s, -back_min);
2482 s = HOPc(s, -back_max);
2485 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2487 last1 = HOPc(s, -back_min);
2491 while (s <= last1) {
2492 if (regtry(®info, &s))
2495 s++; /* to break out of outer loop */
2502 while (s <= last1) {
2503 if (regtry(®info, &s))
2509 DEBUG_EXECUTE_r(if (!did_match) {
2510 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2511 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2512 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2513 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2514 ? "anchored" : "floating"),
2515 quoted, RE_SV_TAIL(must));
2519 else if ( (c = progi->regstclass) ) {
2521 const OPCODE op = OP(progi->regstclass);
2522 /* don't bother with what can't match */
2523 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2524 strend = HOPc(strend, -(minlen - 1));
2527 SV * const prop = sv_newmortal();
2528 regprop(prog, prop, c);
2530 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2532 PerlIO_printf(Perl_debug_log,
2533 "Matching stclass %.*s against %s (%d bytes)\n",
2534 (int)SvCUR(prop), SvPVX_const(prop),
2535 quoted, (int)(strend - s));
2538 if (find_byclass(prog, c, s, strend, ®info))
2540 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2544 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2552 if (! prog->float_utf8) {
2553 to_utf8_substr(prog);
2555 float_real = prog->float_utf8;
2558 if (! prog->float_substr) {
2559 if (! to_byte_substr(prog)) {
2560 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2563 float_real = prog->float_substr;
2566 little = SvPV_const(float_real, len);
2567 if (SvTAIL(float_real)) {
2568 /* This means that float_real contains an artificial \n on
2569 * the end due to the presence of something like this:
2570 * /foo$/ where we can match both "foo" and "foo\n" at the
2571 * end of the string. So we have to compare the end of the
2572 * string first against the float_real without the \n and
2573 * then against the full float_real with the string. We
2574 * have to watch out for cases where the string might be
2575 * smaller than the float_real or the float_real without
2577 char *checkpos= strend - len;
2579 PerlIO_printf(Perl_debug_log,
2580 "%sChecking for float_real.%s\n",
2581 PL_colors[4], PL_colors[5]));
2582 if (checkpos + 1 < strbeg) {
2583 /* can't match, even if we remove the trailing \n
2584 * string is too short to match */
2586 PerlIO_printf(Perl_debug_log,
2587 "%sString shorter than required trailing substring, cannot match.%s\n",
2588 PL_colors[4], PL_colors[5]));
2590 } else if (memEQ(checkpos + 1, little, len - 1)) {
2591 /* can match, the end of the string matches without the
2593 last = checkpos + 1;
2594 } else if (checkpos < strbeg) {
2595 /* cant match, string is too short when the "\n" is
2598 PerlIO_printf(Perl_debug_log,
2599 "%sString does not contain required trailing substring, cannot match.%s\n",
2600 PL_colors[4], PL_colors[5]));
2602 } else if (!multiline) {
2603 /* non multiline match, so compare with the "\n" at the
2604 * end of the string */
2605 if (memEQ(checkpos, little, len)) {
2609 PerlIO_printf(Perl_debug_log,
2610 "%sString does not contain required trailing substring, cannot match.%s\n",
2611 PL_colors[4], PL_colors[5]));
2615 /* multiline match, so we have to search for a place
2616 * where the full string is located */
2622 last = rninstr(s, strend, little, little + len);
2624 last = strend; /* matching "$" */
2627 /* at one point this block contained a comment which was
2628 * probably incorrect, which said that this was a "should not
2629 * happen" case. Even if it was true when it was written I am
2630 * pretty sure it is not anymore, so I have removed the comment
2631 * and replaced it with this one. Yves */
2633 PerlIO_printf(Perl_debug_log,
2634 "String does not contain required substring, cannot match.\n"
2638 dontbother = strend - last + prog->float_min_offset;
2640 if (minlen && (dontbother < minlen))
2641 dontbother = minlen - 1;
2642 strend -= dontbother; /* this one's always in bytes! */
2643 /* We don't know much -- general case. */
2646 if (regtry(®info, &s))
2655 if (regtry(®info, &s))
2657 } while (s++ < strend);
2667 PerlIO_printf(Perl_debug_log,
2668 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2674 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2676 if (PL_reg_state.re_state_eval_setup_done)
2677 restore_pos(aTHX_ prog);
2678 if (RXp_PAREN_NAMES(prog))
2679 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2681 /* make sure $`, $&, $', and $digit will work later */
2682 if ( !(flags & REXEC_NOT_FIRST) ) {
2683 if (flags & REXEC_COPY_STR) {
2687 PerlIO_printf(Perl_debug_log,
2688 "Copy on write: regexp capture, type %d\n",
2691 RX_MATCH_COPY_FREE(rx);
2692 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2693 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2694 assert (SvPOKp(prog->saved_copy));
2695 prog->sublen = PL_regeol - strbeg;
2696 prog->suboffset = 0;
2697 prog->subcoffset = 0;
2702 I32 max = PL_regeol - strbeg;
2705 if ( (flags & REXEC_COPY_SKIP_POST)
2706 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2707 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2708 ) { /* don't copy $' part of string */
2711 /* calculate the right-most part of the string covered
2712 * by a capture. Due to look-ahead, this may be to
2713 * the right of $&, so we have to scan all captures */
2714 while (n <= prog->lastparen) {
2715 if (prog->offs[n].end > max)
2716 max = prog->offs[n].end;
2720 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2721 ? prog->offs[0].start
2723 assert(max >= 0 && max <= PL_regeol - strbeg);
2726 if ( (flags & REXEC_COPY_SKIP_PRE)
2727 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2728 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2729 ) { /* don't copy $` part of string */
2732 /* calculate the left-most part of the string covered
2733 * by a capture. Due to look-behind, this may be to
2734 * the left of $&, so we have to scan all captures */
2735 while (min && n <= prog->lastparen) {
2736 if ( prog->offs[n].start != -1
2737 && prog->offs[n].start < min)
2739 min = prog->offs[n].start;
2743 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2744 && min > prog->offs[0].end
2746 min = prog->offs[0].end;
2750 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2753 if (RX_MATCH_COPIED(rx)) {
2754 if (sublen > prog->sublen)
2756 (char*)saferealloc(prog->subbeg, sublen+1);
2759 prog->subbeg = (char*)safemalloc(sublen+1);
2760 Copy(strbeg + min, prog->subbeg, sublen, char);
2761 prog->subbeg[sublen] = '\0';
2762 prog->suboffset = min;
2763 prog->sublen = sublen;
2764 RX_MATCH_COPIED_on(rx);
2766 prog->subcoffset = prog->suboffset;
2767 if (prog->suboffset && utf8_target) {
2768 /* Convert byte offset to chars.
2769 * XXX ideally should only compute this if @-/@+
2770 * has been seen, a la PL_sawampersand ??? */
2772 /* If there's a direct correspondence between the
2773 * string which we're matching and the original SV,
2774 * then we can use the utf8 len cache associated with
2775 * the SV. In particular, it means that under //g,
2776 * sv_pos_b2u() will use the previously cached
2777 * position to speed up working out the new length of
2778 * subcoffset, rather than counting from the start of
2779 * the string each time. This stops
2780 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2781 * from going quadratic */
2782 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2783 sv_pos_b2u(sv, &(prog->subcoffset));
2785 prog->subcoffset = utf8_length((U8*)strbeg,
2786 (U8*)(strbeg+prog->suboffset));
2790 RX_MATCH_COPY_FREE(rx);
2791 prog->subbeg = strbeg;
2792 prog->suboffset = 0;
2793 prog->subcoffset = 0;
2794 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2801 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2802 PL_colors[4], PL_colors[5]));
2803 if (PL_reg_state.re_state_eval_setup_done)
2804 restore_pos(aTHX_ prog);
2806 /* we failed :-( roll it back */
2807 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2808 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2813 Safefree(prog->offs);
2820 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2821 * Do inc before dec, in case old and new rex are the same */
2822 #define SET_reg_curpm(Re2) \
2823 if (PL_reg_state.re_state_eval_setup_done) { \
2824 (void)ReREFCNT_inc(Re2); \
2825 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2826 PM_SETRE((PL_reg_curpm), (Re2)); \
2831 - regtry - try match at specific point
2833 STATIC I32 /* 0 failure, 1 success */
2834 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2838 REGEXP *const rx = reginfo->prog;
2839 regexp *const prog = ReANY(rx);
2841 RXi_GET_DECL(prog,progi);
2842 GET_RE_DEBUG_FLAGS_DECL;
2844 PERL_ARGS_ASSERT_REGTRY;
2846 reginfo->cutpoint=NULL;
2848 if ((prog->extflags & RXf_EVAL_SEEN)
2849 && !PL_reg_state.re_state_eval_setup_done)
2853 PL_reg_state.re_state_eval_setup_done = TRUE;
2855 /* Make $_ available to executed code. */
2856 if (reginfo->sv != DEFSV) {
2858 DEFSV_set(reginfo->sv);
2861 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2862 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2863 /* prepare for quick setting of pos */
2864 #ifdef PERL_OLD_COPY_ON_WRITE
2865 if (SvIsCOW(reginfo->sv))
2866 sv_force_normal_flags(reginfo->sv, 0);
2868 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2869 &PL_vtbl_mglob, NULL, 0);
2873 PL_reg_oldpos = mg->mg_len;
2874 SAVEDESTRUCTOR_X(restore_pos, prog);
2876 if (!PL_reg_curpm) {
2877 Newxz(PL_reg_curpm, 1, PMOP);
2880 SV* const repointer = &PL_sv_undef;
2881 /* this regexp is also owned by the new PL_reg_curpm, which
2882 will try to free it. */
2883 av_push(PL_regex_padav, repointer);
2884 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2885 PL_regex_pad = AvARRAY(PL_regex_padav);
2890 PL_reg_oldcurpm = PL_curpm;
2891 PL_curpm = PL_reg_curpm;
2892 if (RXp_MATCH_COPIED(prog)) {
2893 /* Here is a serious problem: we cannot rewrite subbeg,
2894 since it may be needed if this match fails. Thus
2895 $` inside (?{}) could fail... */
2896 PL_reg_oldsaved = prog->subbeg;
2897 PL_reg_oldsavedlen = prog->sublen;
2898 PL_reg_oldsavedoffset = prog->suboffset;
2899 PL_reg_oldsavedcoffset = prog->suboffset;
2901 PL_nrs = prog->saved_copy;
2903 RXp_MATCH_COPIED_off(prog);
2906 PL_reg_oldsaved = NULL;
2907 prog->subbeg = PL_bostr;
2908 prog->suboffset = 0;
2909 prog->subcoffset = 0;
2910 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2913 PL_reg_starttry = *startposp;
2915 prog->offs[0].start = *startposp - PL_bostr;
2916 prog->lastparen = 0;
2917 prog->lastcloseparen = 0;
2919 /* XXXX What this code is doing here?!!! There should be no need
2920 to do this again and again, prog->lastparen should take care of
2923 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2924 * Actually, the code in regcppop() (which Ilya may be meaning by
2925 * prog->lastparen), is not needed at all by the test suite
2926 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2927 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2928 * Meanwhile, this code *is* needed for the
2929 * above-mentioned test suite tests to succeed. The common theme
2930 * on those tests seems to be returning null fields from matches.
2931 * --jhi updated by dapm */
2933 if (prog->nparens) {
2934 regexp_paren_pair *pp = prog->offs;
2936 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2944 result = regmatch(reginfo, *startposp, progi->program + 1);
2946 prog->offs[0].end = result;
2949 if (reginfo->cutpoint)
2950 *startposp= reginfo->cutpoint;
2951 REGCP_UNWIND(lastcp);
2956 #define sayYES goto yes
2957 #define sayNO goto no
2958 #define sayNO_SILENT goto no_silent
2960 /* we dont use STMT_START/END here because it leads to
2961 "unreachable code" warnings, which are bogus, but distracting. */
2962 #define CACHEsayNO \
2963 if (ST.cache_mask) \
2964 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2967 /* this is used to determine how far from the left messages like
2968 'failed...' are printed. It should be set such that messages
2969 are inline with the regop output that created them.
2971 #define REPORT_CODE_OFF 32
2974 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2975 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2976 #define CHRTEST_NOT_A_CP_1 -999
2977 #define CHRTEST_NOT_A_CP_2 -998
2979 #define SLAB_FIRST(s) (&(s)->states[0])
2980 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2982 /* grab a new slab and return the first slot in it */
2984 STATIC regmatch_state *
2987 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2990 regmatch_slab *s = PL_regmatch_slab->next;
2992 Newx(s, 1, regmatch_slab);
2993 s->prev = PL_regmatch_slab;
2995 PL_regmatch_slab->next = s;
2997 PL_regmatch_slab = s;
2998 return SLAB_FIRST(s);
3002 /* push a new state then goto it */
3004 #define PUSH_STATE_GOTO(state, node, input) \
3005 pushinput = input; \
3007 st->resume_state = state; \
3010 /* push a new state with success backtracking, then goto it */
3012 #define PUSH_YES_STATE_GOTO(state, node, input) \
3013 pushinput = input; \
3015 st->resume_state = state; \
3016 goto push_yes_state;
3023 regmatch() - main matching routine
3025 This is basically one big switch statement in a loop. We execute an op,
3026 set 'next' to point the next op, and continue. If we come to a point which
3027 we may need to backtrack to on failure such as (A|B|C), we push a
3028 backtrack state onto the backtrack stack. On failure, we pop the top
3029 state, and re-enter the loop at the state indicated. If there are no more
3030 states to pop, we return failure.
3032 Sometimes we also need to backtrack on success; for example /A+/, where
3033 after successfully matching one A, we need to go back and try to
3034 match another one; similarly for lookahead assertions: if the assertion
3035 completes successfully, we backtrack to the state just before the assertion
3036 and then carry on. In these cases, the pushed state is marked as
3037 'backtrack on success too'. This marking is in fact done by a chain of
3038 pointers, each pointing to the previous 'yes' state. On success, we pop to
3039 the nearest yes state, discarding any intermediate failure-only states.
3040 Sometimes a yes state is pushed just to force some cleanup code to be
3041 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3042 it to free the inner regex.
3044 Note that failure backtracking rewinds the cursor position, while
3045 success backtracking leaves it alone.
3047 A pattern is complete when the END op is executed, while a subpattern
3048 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3049 ops trigger the "pop to last yes state if any, otherwise return true"
3052 A common convention in this function is to use A and B to refer to the two
3053 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3054 the subpattern to be matched possibly multiple times, while B is the entire
3055 rest of the pattern. Variable and state names reflect this convention.
3057 The states in the main switch are the union of ops and failure/success of
3058 substates associated with with that op. For example, IFMATCH is the op
3059 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3060 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3061 successfully matched A and IFMATCH_A_fail is a state saying that we have
3062 just failed to match A. Resume states always come in pairs. The backtrack
3063 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3064 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3065 on success or failure.
3067 The struct that holds a backtracking state is actually a big union, with
3068 one variant for each major type of op. The variable st points to the
3069 top-most backtrack struct. To make the code clearer, within each
3070 block of code we #define ST to alias the relevant union.
3072 Here's a concrete example of a (vastly oversimplified) IFMATCH
3078 #define ST st->u.ifmatch
3080 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3081 ST.foo = ...; // some state we wish to save
3083 // push a yes backtrack state with a resume value of
3084 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3086 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3089 case IFMATCH_A: // we have successfully executed A; now continue with B
3091 bar = ST.foo; // do something with the preserved value
3094 case IFMATCH_A_fail: // A failed, so the assertion failed
3095 ...; // do some housekeeping, then ...
3096 sayNO; // propagate the failure
3103 For any old-timers reading this who are familiar with the old recursive
3104 approach, the code above is equivalent to:
3106 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3115 ...; // do some housekeeping, then ...
3116 sayNO; // propagate the failure
3119 The topmost backtrack state, pointed to by st, is usually free. If you
3120 want to claim it, populate any ST.foo fields in it with values you wish to
3121 save, then do one of
3123 PUSH_STATE_GOTO(resume_state, node, newinput);
3124 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3126 which sets that backtrack state's resume value to 'resume_state', pushes a
3127 new free entry to the top of the backtrack stack, then goes to 'node'.
3128 On backtracking, the free slot is popped, and the saved state becomes the
3129 new free state. An ST.foo field in this new top state can be temporarily
3130 accessed to retrieve values, but once the main loop is re-entered, it
3131 becomes available for reuse.
3133 Note that the depth of the backtrack stack constantly increases during the
3134 left-to-right execution of the pattern, rather than going up and down with
3135 the pattern nesting. For example the stack is at its maximum at Z at the
3136 end of the pattern, rather than at X in the following:
3138 /(((X)+)+)+....(Y)+....Z/
3140 The only exceptions to this are lookahead/behind assertions and the cut,
3141 (?>A), which pop all the backtrack states associated with A before
3144 Backtrack state structs are allocated in slabs of about 4K in size.
3145 PL_regmatch_state and st always point to the currently active state,
3146 and PL_regmatch_slab points to the slab currently containing
3147 PL_regmatch_state. The first time regmatch() is called, the first slab is
3148 allocated, and is never freed until interpreter destruction. When the slab
3149 is full, a new one is allocated and chained to the end. At exit from
3150 regmatch(), slabs allocated since entry are freed.
3155 #define DEBUG_STATE_pp(pp) \
3157 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3158 PerlIO_printf(Perl_debug_log, \
3159 " %*s"pp" %s%s%s%s%s\n", \
3161 PL_reg_name[st->resume_state], \
3162 ((st==yes_state||st==mark_state) ? "[" : ""), \
3163 ((st==yes_state) ? "Y" : ""), \
3164 ((st==mark_state) ? "M" : ""), \
3165 ((st==yes_state||st==mark_state) ? "]" : "") \
3170 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3175 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3176 const char *start, const char *end, const char *blurb)
3178 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3180 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3185 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3186 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3188 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3189 start, end - start, 60);
3191 PerlIO_printf(Perl_debug_log,
3192 "%s%s REx%s %s against %s\n",
3193 PL_colors[4], blurb, PL_colors[5], s0, s1);
3195 if (utf8_target||utf8_pat)
3196 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3197 utf8_pat ? "pattern" : "",
3198 utf8_pat && utf8_target ? " and " : "",
3199 utf8_target ? "string" : ""
3205 S_dump_exec_pos(pTHX_ const char *locinput,
3206 const regnode *scan,
3207 const char *loc_regeol,
3208 const char *loc_bostr,
3209 const char *loc_reg_starttry,
3210 const bool utf8_target)
3212 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3213 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3214 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3215 /* The part of the string before starttry has one color
3216 (pref0_len chars), between starttry and current
3217 position another one (pref_len - pref0_len chars),
3218 after the current position the third one.
3219 We assume that pref0_len <= pref_len, otherwise we
3220 decrease pref0_len. */
3221 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3222 ? (5 + taill) - l : locinput - loc_bostr;
3225 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3227 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3229 pref0_len = pref_len - (locinput - loc_reg_starttry);
3230 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3231 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3232 ? (5 + taill) - pref_len : loc_regeol - locinput);
3233 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3237 if (pref0_len > pref_len)
3238 pref0_len = pref_len;
3240 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3242 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3243 (locinput - pref_len),pref0_len, 60, 4, 5);
3245 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3246 (locinput - pref_len + pref0_len),
3247 pref_len - pref0_len, 60, 2, 3);
3249 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3250 locinput, loc_regeol - locinput, 10, 0, 1);
3252 const STRLEN tlen=len0+len1+len2;
3253 PerlIO_printf(Perl_debug_log,
3254 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3255 (IV)(locinput - loc_bostr),
3258 (docolor ? "" : "> <"),
3260 (int)(tlen > 19 ? 0 : 19 - tlen),
3267 /* reg_check_named_buff_matched()
3268 * Checks to see if a named buffer has matched. The data array of
3269 * buffer numbers corresponding to the buffer is expected to reside
3270 * in the regexp->data->data array in the slot stored in the ARG() of
3271 * node involved. Note that this routine doesn't actually care about the
3272 * name, that information is not preserved from compilation to execution.
3273 * Returns the index of the leftmost defined buffer with the given name
3274 * or 0 if non of the buffers matched.
3277 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3280 RXi_GET_DECL(rex,rexi);
3281 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3282 I32 *nums=(I32*)SvPVX(sv_dat);
3284 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3286 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3287 if ((I32)rex->lastparen >= nums[n] &&
3288 rex->offs[nums[n]].end != -1)
3297 /* free all slabs above current one - called during LEAVE_SCOPE */
3300 S_clear_backtrack_stack(pTHX_ void *p)
3302 regmatch_slab *s = PL_regmatch_slab->next;
3307 PL_regmatch_slab->next = NULL;
3309 regmatch_slab * const osl = s;
3315 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3317 /* This function determines if there are one or two characters that match
3318 * the first character of the passed-in EXACTish node <text_node>, and if
3319 * so, returns them in the passed-in pointers.
3321 * If it determines that no possible character in the target string can
3322 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3323 * the first character in <text_node> requires UTF-8 to represent, and the
3324 * target string isn't in UTF-8.)
3326 * If there are more than two characters that could match the beginning of
3327 * <text_node>, or if more context is required to determine a match or not,
3328 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3330 * The motiviation behind this function is to allow the caller to set up
3331 * tight loops for matching. If <text_node> is of type EXACT, there is
3332 * only one possible character that can match its first character, and so
3333 * the situation is quite simple. But things get much more complicated if
3334 * folding is involved. It may be that the first character of an EXACTFish
3335 * node doesn't participate in any possible fold, e.g., punctuation, so it
3336 * can be matched only by itself. The vast majority of characters that are
3337 * in folds match just two things, their lower and upper-case equivalents.
3338 * But not all are like that; some have multiple possible matches, or match
3339 * sequences of more than one character. This function sorts all that out.
3341 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3342 * loop of trying to match A*, we know we can't exit where the thing
3343 * following it isn't a B. And something can't be a B unless it is the
3344 * beginning of B. By putting a quick test for that beginning in a tight
3345 * loop, we can rule out things that can't possibly be B without having to
3346 * break out of the loop, thus avoiding work. Similarly, if A is a single
3347 * character, we can make a tight loop matching A*, using the outputs of
3350 * If the target string to match isn't in UTF-8, and there aren't
3351 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3352 * the one or two possible octets (which are characters in this situation)
3353 * that can match. In all cases, if there is only one character that can
3354 * match, *<c1p> and *<c2p> will be identical.
3356 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3357 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3358 * can match the beginning of <text_node>. They should be declared with at
3359 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3360 * undefined what these contain.) If one or both of the buffers are
3361 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3362 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3363 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3364 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3365 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3367 const bool utf8_target = PL_reg_match_utf8;
3369 UV c1 = CHRTEST_NOT_A_CP_1;
3370 UV c2 = CHRTEST_NOT_A_CP_2;
3371 bool use_chrtest_void = FALSE;
3373 /* Used when we have both utf8 input and utf8 output, to avoid converting
3374 * to/from code points */
3375 bool utf8_has_been_setup = FALSE;
3379 U8 *pat = (U8*)STRING(text_node);
3381 if (OP(text_node) == EXACT) {
3383 /* In an exact node, only one thing can be matched, that first
3384 * character. If both the pat and the target are UTF-8, we can just
3385 * copy the input to the output, avoiding finding the code point of
3387 if (! UTF_PATTERN) {
3390 else if (utf8_target) {
3391 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3392 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3393 utf8_has_been_setup = TRUE;
3396 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3399 else /* an EXACTFish node */
3401 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3402 pat + STR_LEN(text_node)))
3404 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3405 pat + STR_LEN(text_node))))
3407 /* Multi-character folds require more context to sort out. Also
3408 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3409 * handled outside this routine */
3410 use_chrtest_void = TRUE;
3412 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3413 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3415 /* Load the folds hash, if not already done */
3417 if (! PL_utf8_foldclosures) {
3418 if (! PL_utf8_tofold) {
3419 U8 dummy[UTF8_MAXBYTES+1];
3421 /* Force loading this by folding an above-Latin1 char */
3422 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3423 assert(PL_utf8_tofold); /* Verify that worked */
3425 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3428 /* The fold closures data structure is a hash with the keys being
3429 * the UTF-8 of every character that is folded to, like 'k', and
3430 * the values each an array of all code points that fold to its
3431 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3433 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3438 /* Not found in the hash, therefore there are no folds
3439 * containing it, so there is only a single character that
3443 else { /* Does participate in folds */
3444 AV* list = (AV*) *listp;
3445 if (av_len(list) != 1) {
3447 /* If there aren't exactly two folds to this, it is outside
3448 * the scope of this function */
3449 use_chrtest_void = TRUE;
3451 else { /* There are two. Get them */
3452 SV** c_p = av_fetch(list, 0, FALSE);
3454 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3458 c_p = av_fetch(list, 1, FALSE);
3460 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3464 /* Folds that cross the 255/256 boundary are forbidden if
3465 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3466 * pattern character is above 256, and its only other match
3467 * is below 256, the only legal match will be to itself.
3468 * We have thrown away the original, so have to compute
3469 * which is the one above 255 */
3470 if ((c1 < 256) != (c2 < 256)) {
3471 if (OP(text_node) == EXACTFL
3472 || (OP(text_node) == EXACTFA
3473 && (isASCII(c1) || isASCII(c2))))
3486 else /* Here, c1 is < 255 */
3488 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3489 && OP(text_node) != EXACTFL
3490 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3492 /* Here, there could be something above Latin1 in the target which
3493 * folds to this character in the pattern. All such cases except
3494 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3495 * involved in their folds, so are outside the scope of this
3497 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3498 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3501 use_chrtest_void = TRUE;
3504 else { /* Here nothing above Latin1 can fold to the pattern character */
3505 switch (OP(text_node)) {
3507 case EXACTFL: /* /l rules */
3508 c2 = PL_fold_locale[c1];
3512 if (! utf8_target) { /* /d rules */
3517 /* /u rules for all these. This happens to work for
3518 * EXACTFA as nothing in Latin1 folds to ASCII */
3520 case EXACTFU_TRICKYFOLD:
3523 c2 = PL_fold_latin1[c1];
3527 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3528 assert(0); /* NOTREACHED */
3533 /* Here have figured things out. Set up the returns */
3534 if (use_chrtest_void) {
3535 *c2p = *c1p = CHRTEST_VOID;
3537 else if (utf8_target) {
3538 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3539 uvchr_to_utf8(c1_utf8, c1);
3540 uvchr_to_utf8(c2_utf8, c2);
3543 /* Invariants are stored in both the utf8 and byte outputs; Use
3544 * negative numbers otherwise for the byte ones. Make sure that the
3545 * byte ones are the same iff the utf8 ones are the same */
3546 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3547 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3550 ? CHRTEST_NOT_A_CP_1
3551 : CHRTEST_NOT_A_CP_2;
3553 else if (c1 > 255) {
3554 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3559 *c1p = *c2p = c2; /* c2 is the only representable value */
3561 else { /* c1 is representable; see about c2 */
3563 *c2p = (c2 < 256) ? c2 : c1;
3569 /* returns -1 on failure, $+[0] on success */
3571 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3573 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3577 const bool utf8_target = PL_reg_match_utf8;
3578 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3579 REGEXP *rex_sv = reginfo->prog;
3580 regexp *rex = ReANY(rex_sv);
3581 RXi_GET_DECL(rex,rexi);
3583 /* the current state. This is a cached copy of PL_regmatch_state */
3585 /* cache heavy used fields of st in registers */
3588 U32 n = 0; /* general value; init to avoid compiler warning */
3589 I32 ln = 0; /* len or last; init to avoid compiler warning */
3590 char *locinput = startpos;
3591 char *pushinput; /* where to continue after a PUSH */
3592 I32 nextchr; /* is always set to UCHARAT(locinput) */
3594 bool result = 0; /* return value of S_regmatch */
3595 int depth = 0; /* depth of backtrack stack */
3596 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3597 const U32 max_nochange_depth =
3598 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3599 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3600 regmatch_state *yes_state = NULL; /* state to pop to on success of
3602 /* mark_state piggy backs on the yes_state logic so that when we unwind
3603 the stack on success we can update the mark_state as we go */
3604 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3605 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3606 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3608 bool no_final = 0; /* prevent failure from backtracking? */
3609 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3610 char *startpoint = locinput;
3611 SV *popmark = NULL; /* are we looking for a mark? */
3612 SV *sv_commit = NULL; /* last mark name seen in failure */
3613 SV *sv_yes_mark = NULL; /* last mark name we have seen
3614 during a successful match */
3615 U32 lastopen = 0; /* last open we saw */
3616 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3617 SV* const oreplsv = GvSV(PL_replgv);
3618 /* these three flags are set by various ops to signal information to
3619 * the very next op. They have a useful lifetime of exactly one loop
3620 * iteration, and are not preserved or restored by state pushes/pops
3622 bool sw = 0; /* the condition value in (?(cond)a|b) */
3623 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3624 int logical = 0; /* the following EVAL is:
3628 or the following IFMATCH/UNLESSM is:
3629 false: plain (?=foo)
3630 true: used as a condition: (?(?=foo))
3632 PAD* last_pad = NULL;
3634 I32 gimme = G_SCALAR;
3635 CV *caller_cv = NULL; /* who called us */
3636 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3637 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3638 U32 maxopenparen = 0; /* max '(' index seen so far */
3641 GET_RE_DEBUG_FLAGS_DECL;
3644 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3645 multicall_oldcatch = 0;
3646 multicall_cv = NULL;
3648 PERL_UNUSED_VAR(multicall_cop);
3649 PERL_UNUSED_VAR(newsp);
3652 PERL_ARGS_ASSERT_REGMATCH;
3654 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3655 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3657 /* on first ever call to regmatch, allocate first slab */
3658 if (!PL_regmatch_slab) {
3659 Newx(PL_regmatch_slab, 1, regmatch_slab);
3660 PL_regmatch_slab->prev = NULL;
3661 PL_regmatch_slab->next = NULL;
3662 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3665 oldsave = PL_savestack_ix;
3666 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3667 SAVEVPTR(PL_regmatch_slab);
3668 SAVEVPTR(PL_regmatch_state);
3670 /* grab next free state slot */
3671 st = ++PL_regmatch_state;
3672 if (st > SLAB_LAST(PL_regmatch_slab))
3673 st = PL_regmatch_state = S_push_slab(aTHX);
3675 /* Note that nextchr is a byte even in UTF */
3678 while (scan != NULL) {
3681 SV * const prop = sv_newmortal();
3682 regnode *rnext=regnext(scan);
3683 DUMP_EXEC_POS( locinput, scan, utf8_target );
3684 regprop(rex, prop, scan);
3686 PerlIO_printf(Perl_debug_log,
3687 "%3"IVdf":%*s%s(%"IVdf")\n",
3688 (IV)(scan - rexi->program), depth*2, "",
3690 (PL_regkind[OP(scan)] == END || !rnext) ?
3691 0 : (IV)(rnext - rexi->program));
3694 next = scan + NEXT_OFF(scan);
3697 state_num = OP(scan);
3699 REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3703 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3705 switch (state_num) {
3706 case BOL: /* /^../ */
3707 if (locinput == PL_bostr)
3709 /* reginfo->till = reginfo->bol; */
3714 case MBOL: /* /^../m */
3715 if (locinput == PL_bostr ||
3716 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3722 case SBOL: /* /^../s */
3723 if (locinput == PL_bostr)
3728 if (locinput == reginfo->ganch)
3732 case KEEPS: /* \K */
3733 /* update the startpoint */
3734 st->u.keeper.val = rex->offs[0].start;
3735 rex->offs[0].start = locinput - PL_bostr;
3736 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3737 assert(0); /*NOTREACHED*/
3738 case KEEPS_next_fail:
3739 /* rollback the start point change */
3740 rex->offs[0].start = st->u.keeper.val;
3742 assert(0); /*NOTREACHED*/
3744 case EOL: /* /..$/ */
3747 case MEOL: /* /..$/m */
3748 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3752 case SEOL: /* /..$/s */
3754 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3756 if (PL_regeol - locinput > 1)
3761 if (!NEXTCHR_IS_EOS)
3765 case SANY: /* /./s */
3768 goto increment_locinput;
3776 case REG_ANY: /* /./ */
3777 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3779 goto increment_locinput;
3783 #define ST st->u.trie
3784 case TRIEC: /* (ab|cd) with known charclass */
3785 /* In this case the charclass data is available inline so
3786 we can fail fast without a lot of extra overhead.
3788 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3790 PerlIO_printf(Perl_debug_log,
3791 "%*s %sfailed to match trie start class...%s\n",
3792 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3795 assert(0); /* NOTREACHED */
3798 case TRIE: /* (ab|cd) */
3799 /* the basic plan of execution of the trie is:
3800 * At the beginning, run though all the states, and
3801 * find the longest-matching word. Also remember the position
3802 * of the shortest matching word. For example, this pattern:
3805 * when matched against the string "abcde", will generate
3806 * accept states for all words except 3, with the longest
3807 * matching word being 4, and the shortest being 2 (with
3808 * the position being after char 1 of the string).
3810 * Then for each matching word, in word order (i.e. 1,2,4,5),
3811 * we run the remainder of the pattern; on each try setting
3812 * the current position to the character following the word,
3813 * returning to try the next word on failure.
3815 * We avoid having to build a list of words at runtime by
3816 * using a compile-time structure, wordinfo[].prev, which
3817 * gives, for each word, the previous accepting word (if any).
3818 * In the case above it would contain the mappings 1->2, 2->0,
3819 * 3->0, 4->5, 5->1. We can use this table to generate, from
3820 * the longest word (4 above), a list of all words, by
3821 * following the list of prev pointers; this gives us the
3822 * unordered list 4,5,1,2. Then given the current word we have
3823 * just tried, we can go through the list and find the
3824 * next-biggest word to try (so if we just failed on word 2,
3825 * the next in the list is 4).
3827 * Since at runtime we don't record the matching position in
3828 * the string for each word, we have to work that out for
3829 * each word we're about to process. The wordinfo table holds
3830 * the character length of each word; given that we recorded
3831 * at the start: the position of the shortest word and its
3832 * length in chars, we just need to move the pointer the
3833 * difference between the two char lengths. Depending on
3834 * Unicode status and folding, that's cheap or expensive.
3836 * This algorithm is optimised for the case where are only a
3837 * small number of accept states, i.e. 0,1, or maybe 2.
3838 * With lots of accepts states, and having to try all of them,
3839 * it becomes quadratic on number of accept states to find all
3844 /* what type of TRIE am I? (utf8 makes this contextual) */
3845 DECL_TRIE_TYPE(scan);
3847 /* what trie are we using right now */
3848 reg_trie_data * const trie
3849 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3850 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3851 U32 state = trie->startstate;
3854 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3856 if (trie->states[ state ].wordnum) {
3858 PerlIO_printf(Perl_debug_log,
3859 "%*s %smatched empty string...%s\n",
3860 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3866 PerlIO_printf(Perl_debug_log,
3867 "%*s %sfailed to match trie start class...%s\n",
3868 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3875 U8 *uc = ( U8* )locinput;
3879 U8 *uscan = (U8*)NULL;
3880 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3881 U32 charcount = 0; /* how many input chars we have matched */
3882 U32 accepted = 0; /* have we seen any accepting states? */
3884 ST.jump = trie->jump;
3887 ST.longfold = FALSE; /* char longer if folded => it's harder */
3890 /* fully traverse the TRIE; note the position of the
3891 shortest accept state and the wordnum of the longest
3894 while ( state && uc <= (U8*)PL_regeol ) {
3895 U32 base = trie->states[ state ].trans.base;
3899 wordnum = trie->states[ state ].wordnum;
3901 if (wordnum) { /* it's an accept state */
3904 /* record first match position */
3906 ST.firstpos = (U8*)locinput;
3911 ST.firstchars = charcount;
3914 if (!ST.nextword || wordnum < ST.nextword)
3915 ST.nextword = wordnum;
3916 ST.topword = wordnum;
3919 DEBUG_TRIE_EXECUTE_r({
3920 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3921 PerlIO_printf( Perl_debug_log,
3922 "%*s %sState: %4"UVxf" Accepted: %c ",
3923 2+depth * 2, "", PL_colors[4],
3924 (UV)state, (accepted ? 'Y' : 'N'));
3927 /* read a char and goto next state */
3928 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3930 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3931 uscan, len, uvc, charid, foldlen,
3938 base + charid - 1 - trie->uniquecharcount)) >= 0)
3940 && ((U32)offset < trie->lasttrans)
3941 && trie->trans[offset].check == state)
3943 state = trie->trans[offset].next;
3954 DEBUG_TRIE_EXECUTE_r(
3955 PerlIO_printf( Perl_debug_log,
3956 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3957 charid, uvc, (UV)state, PL_colors[5] );
3963 /* calculate total number of accept states */
3968 w = trie->wordinfo[w].prev;
3971 ST.accepted = accepted;
3975 PerlIO_printf( Perl_debug_log,
3976 "%*s %sgot %"IVdf" possible matches%s\n",
3977 REPORT_CODE_OFF + depth * 2, "",
3978 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3980 goto trie_first_try; /* jump into the fail handler */
3982 assert(0); /* NOTREACHED */
3984 case TRIE_next_fail: /* we failed - try next alternative */
3988 REGCP_UNWIND(ST.cp);
3989 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3991 if (!--ST.accepted) {
3993 PerlIO_printf( Perl_debug_log,
3994 "%*s %sTRIE failed...%s\n",
3995 REPORT_CODE_OFF+depth*2, "",
4002 /* Find next-highest word to process. Note that this code
4003 * is O(N^2) per trie run (O(N) per branch), so keep tight */
4006 U16 const nextword = ST.nextword;
4007 reg_trie_wordinfo * const wordinfo
4008 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4009 for (word=ST.topword; word; word=wordinfo[word].prev) {
4010 if (word > nextword && (!min || word < min))
4023 ST.lastparen = rex->lastparen;
4024 ST.lastcloseparen = rex->lastcloseparen;
4028 /* find start char of end of current word */
4030 U32 chars; /* how many chars to skip */
4031 reg_trie_data * const trie
4032 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4034 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4036 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4041 /* the hard option - fold each char in turn and find
4042 * its folded length (which may be different */
4043 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4051 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4059 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4064 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4080 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4081 ? ST.jump[ST.nextword]
4085 PerlIO_printf( Perl_debug_log,
4086 "%*s %sTRIE matched word #%d, continuing%s\n",
4087 REPORT_CODE_OFF+depth*2, "",
4094 if (ST.accepted > 1 || has_cutgroup) {
4095 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4096 assert(0); /* NOTREACHED */
4098 /* only one choice left - just continue */
4100 AV *const trie_words
4101 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4102 SV ** const tmp = av_fetch( trie_words,
4104 SV *sv= tmp ? sv_newmortal() : NULL;
4106 PerlIO_printf( Perl_debug_log,
4107 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4108 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4110 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4111 PL_colors[0], PL_colors[1],
4112 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4114 : "not compiled under -Dr",
4118 locinput = (char*)uc;
4119 continue; /* execute rest of RE */
4120 assert(0); /* NOTREACHED */
4124 case EXACT: { /* /abc/ */
4125 char *s = STRING(scan);
4127 if (utf8_target != UTF_PATTERN) {
4128 /* The target and the pattern have differing utf8ness. */
4130 const char * const e = s + ln;
4133 /* The target is utf8, the pattern is not utf8.
4134 * Above-Latin1 code points can't match the pattern;
4135 * invariants match exactly, and the other Latin1 ones need
4136 * to be downgraded to a single byte in order to do the
4137 * comparison. (If we could be confident that the target
4138 * is not malformed, this could be refactored to have fewer
4139 * tests by just assuming that if the first bytes match, it
4140 * is an invariant, but there are tests in the test suite
4141 * dealing with (??{...}) which violate this) */
4145 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4148 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4155 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4164 /* The target is not utf8, the pattern is utf8. */
4166 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4170 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4177 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4188 /* The target and the pattern have the same utf8ness. */
4189 /* Inline the first character, for speed. */
4190 if (UCHARAT(s) != nextchr)
4192 if (PL_regeol - locinput < ln)
4194 if (ln > 1 && memNE(s, locinput, ln))
4200 case EXACTFL: { /* /abc/il */
4202 const U8 * fold_array;
4204 U32 fold_utf8_flags;
4206 PL_reg_flags |= RF_tainted;
4207 folder = foldEQ_locale;
4208 fold_array = PL_fold_locale;
4209 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4212 case EXACTFU_SS: /* /\x{df}/iu */
4213 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4214 case EXACTFU: /* /abc/iu */
4215 folder = foldEQ_latin1;
4216 fold_array = PL_fold_latin1;
4217 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4220 case EXACTFA: /* /abc/iaa */
4221 folder = foldEQ_latin1;
4222 fold_array = PL_fold_latin1;
4223 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4226 case EXACTF: /* /abc/i */
4228 fold_array = PL_fold;
4229 fold_utf8_flags = 0;
4235 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4236 /* Either target or the pattern are utf8, or has the issue where
4237 * the fold lengths may differ. */
4238 const char * const l = locinput;
4239 char *e = PL_regeol;
4241 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
4242 l, &e, 0, utf8_target, fold_utf8_flags))
4250 /* Neither the target nor the pattern are utf8 */
4251 if (UCHARAT(s) != nextchr
4253 && UCHARAT(s) != fold_array[nextchr])
4257 if (PL_regeol - locinput < ln)
4259 if (ln > 1 && ! folder(s, locinput, ln))
4265 /* XXX Could improve efficiency by separating these all out using a
4266 * macro or in-line function. At that point regcomp.c would no longer
4267 * have to set the FLAGS fields of these */
4268 case BOUNDL: /* /\b/l */
4269 case NBOUNDL: /* /\B/l */
4270 PL_reg_flags |= RF_tainted;
4272 case BOUND: /* /\b/ */
4273 case BOUNDU: /* /\b/u */
4274 case BOUNDA: /* /\b/a */
4275 case NBOUND: /* /\B/ */
4276 case NBOUNDU: /* /\B/u */
4277 case NBOUNDA: /* /\B/a */
4278 /* was last char in word? */
4280 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4281 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4283 if (locinput == PL_bostr)
4286 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4288 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4290 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4291 ln = isALNUM_uni(ln);
4295 LOAD_UTF8_CHARCLASS_ALNUM();
4296 n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4301 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4302 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4307 /* Here the string isn't utf8, or is utf8 and only ascii
4308 * characters are to match \w. In the latter case looking at
4309 * the byte just prior to the current one may be just the final
4310 * byte of a multi-byte character. This is ok. There are two
4312 * 1) it is a single byte character, and then the test is doing
4313 * just what it's supposed to.
4314 * 2) it is a multi-byte character, in which case the final
4315 * byte is never mistakable for ASCII, and so the test
4316 * will say it is not a word character, which is the
4317 * correct answer. */
4318 ln = (locinput != PL_bostr) ?
4319 UCHARAT(locinput - 1) : '\n';
4320 switch (FLAGS(scan)) {
4321 case REGEX_UNICODE_CHARSET:
4322 ln = isWORDCHAR_L1(ln);
4323 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4325 case REGEX_LOCALE_CHARSET:
4326 ln = isALNUM_LC(ln);
4327 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4329 case REGEX_DEPENDS_CHARSET:
4331 n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4333 case REGEX_ASCII_RESTRICTED_CHARSET:
4334 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4335 ln = isWORDCHAR_A(ln);
4336 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4339 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4343 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4345 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4349 case ANYOF: /* /[abc]/ */
4353 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4355 locinput += UTF8SKIP(locinput);
4359 if (!REGINCLASS(rex, scan, (U8*)locinput))
4366 /* Special char classes: \d, \w etc.
4367 * The defines start on line 166 or so */
4368 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
4369 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4370 ALNUMU, NALNUMU, isWORDCHAR_L1,
4371 ALNUMA, NALNUMA, isWORDCHAR_A,
4375 PL_reg_flags |= RF_tainted;
4376 if (NEXTCHR_IS_EOS) {
4379 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4380 if (! isSPACE_LC_utf8((U8 *) locinput)) {
4384 else if (! isSPACE_LC((U8) nextchr)) {
4387 goto increment_locinput;
4390 PL_reg_flags |= RF_tainted;
4391 if (NEXTCHR_IS_EOS) {
4394 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4395 if (isSPACE_LC_utf8((U8 *) locinput)) {
4399 else if (isSPACE_LC(nextchr)) {
4402 goto increment_locinput;
4410 if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
4413 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4423 if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
4426 goto increment_locinput;
4430 if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
4433 goto increment_locinput;
4437 if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
4440 goto increment_locinput;
4442 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4443 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4444 DIGITA, NDIGITA, isDIGIT_A,
4447 case POSIXA: /* /[[:ascii:]]/ etc */
4448 if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4451 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4455 case NPOSIXA: /* /[^[:ascii:]]/ etc */
4456 if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4459 goto increment_locinput;
4461 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4462 a Unicode extended Grapheme Cluster */
4463 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4464 extended Grapheme Cluster is:
4467 | Prepend* Begin Extend*
4470 Begin is: ( Special_Begin | ! Control )
4471 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4472 Extend is: ( Grapheme_Extend | Spacing_Mark )
4473 Control is: [ GCB_Control | CR | LF ]
4474 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4476 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4479 Begin is ( Regular_Begin + Special Begin )
4481 It turns out that 98.4% of all Unicode code points match
4482 Regular_Begin. Doing it this way eliminates a table match in
4483 the previous implementation for almost all Unicode code points.
4485 There is a subtlety with Prepend* which showed up in testing.
4486 Note that the Begin, and only the Begin is required in:
4487 | Prepend* Begin Extend*
4488 Also, Begin contains '! Control'. A Prepend must be a
4489 '! Control', which means it must also be a Begin. What it
4490 comes down to is that if we match Prepend* and then find no
4491 suitable Begin afterwards, that if we backtrack the last
4492 Prepend, that one will be a suitable Begin.
4497 if (! utf8_target) {
4499 /* Match either CR LF or '.', as all the other possibilities
4501 locinput++; /* Match the . or CR */
4502 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4504 && locinput < PL_regeol
4505 && UCHARAT(locinput) == '\n')
4512 /* Utf8: See if is ( CR LF ); already know that locinput <
4513 * PL_regeol, so locinput+1 is in bounds */
4514 if ( nextchr == '\r' && locinput+1 < PL_regeol
4515 && UCHARAT(locinput + 1) == '\n')
4522 /* In case have to backtrack to beginning, then match '.' */
4523 char *starting = locinput;
4525 /* In case have to backtrack the last prepend */
4526 char *previous_prepend = NULL;
4528 LOAD_UTF8_CHARCLASS_GCB();
4530 /* Match (prepend)* */
4531 while (locinput < PL_regeol
4532 && (len = is_GCB_Prepend_utf8(locinput)))
4534 previous_prepend = locinput;
4538 /* As noted above, if we matched a prepend character, but
4539 * the next thing won't match, back off the last prepend we
4540 * matched, as it is guaranteed to match the begin */
4541 if (previous_prepend
4542 && (locinput >= PL_regeol
4543 || (! swash_fetch(PL_utf8_X_regular_begin,
4544 (U8*)locinput, utf8_target)
4545 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4548 locinput = previous_prepend;
4551 /* Note that here we know PL_regeol > locinput, as we
4552 * tested that upon input to this switch case, and if we
4553 * moved locinput forward, we tested the result just above
4554 * and it either passed, or we backed off so that it will
4556 if (swash_fetch(PL_utf8_X_regular_begin,
4557 (U8*)locinput, utf8_target)) {
4558 locinput += UTF8SKIP(locinput);
4560 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4562 /* Here did not match the required 'Begin' in the
4563 * second term. So just match the very first
4564 * character, the '.' of the final term of the regex */
4565 locinput = starting + UTF8SKIP(starting);
4569 /* Here is a special begin. It can be composed of
4570 * several individual characters. One possibility is
4572 if ((len = is_GCB_RI_utf8(locinput))) {
4574 while (locinput < PL_regeol
4575 && (len = is_GCB_RI_utf8(locinput)))
4579 } else if ((len = is_GCB_T_utf8(locinput))) {
4580 /* Another possibility is T+ */
4582 while (locinput < PL_regeol
4583 && (len = is_GCB_T_utf8(locinput)))
4589 /* Here, neither RI+ nor T+; must be some other
4590 * Hangul. That means it is one of the others: L,
4591 * LV, LVT or V, and matches:
4592 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4595 while (locinput < PL_regeol
4596 && (len = is_GCB_L_utf8(locinput)))
4601 /* Here, have exhausted L*. If the next character
4602 * is not an LV, LVT nor V, it means we had to have
4603 * at least one L, so matches L+ in the original
4604 * equation, we have a complete hangul syllable.
4607 if (locinput < PL_regeol
4608 && is_GCB_LV_LVT_V_utf8(locinput))
4610 /* Otherwise keep going. Must be LV, LVT or V.
4611 * See if LVT, by first ruling out V, then LV */
4612 if (! is_GCB_V_utf8(locinput)
4613 /* All but every TCount one is LV */
4614 && (valid_utf8_to_uvchr((U8 *) locinput,
4619 locinput += UTF8SKIP(locinput);
4622 /* Must be V or LV. Take it, then match
4624 locinput += UTF8SKIP(locinput);
4625 while (locinput < PL_regeol
4626 && (len = is_GCB_V_utf8(locinput)))
4632 /* And any of LV, LVT, or V can be followed
4634 while (locinput < PL_regeol
4635 && (len = is_GCB_T_utf8(locinput)))
4643 /* Match any extender */
4644 while (locinput < PL_regeol
4645 && swash_fetch(PL_utf8_X_extend,
4646 (U8*)locinput, utf8_target))
4648 locinput += UTF8SKIP(locinput);
4652 if (locinput > PL_regeol) sayNO;
4656 case NREFFL: /* /\g{name}/il */
4657 { /* The capture buffer cases. The ones beginning with N for the
4658 named buffers just convert to the equivalent numbered and
4659 pretend they were called as the corresponding numbered buffer
4661 /* don't initialize these in the declaration, it makes C++
4666 const U8 *fold_array;
4669 PL_reg_flags |= RF_tainted;
4670 folder = foldEQ_locale;
4671 fold_array = PL_fold_locale;
4673 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4676 case NREFFA: /* /\g{name}/iaa */
4677 folder = foldEQ_latin1;
4678 fold_array = PL_fold_latin1;
4680 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4683 case NREFFU: /* /\g{name}/iu */
4684 folder = foldEQ_latin1;
4685 fold_array = PL_fold_latin1;
4687 utf8_fold_flags = 0;
4690 case NREFF: /* /\g{name}/i */
4692 fold_array = PL_fold;
4694 utf8_fold_flags = 0;
4697 case NREF: /* /\g{name}/ */
4701 utf8_fold_flags = 0;
4704 /* For the named back references, find the corresponding buffer
4706 n = reg_check_named_buff_matched(rex,scan);
4711 goto do_nref_ref_common;
4713 case REFFL: /* /\1/il */
4714 PL_reg_flags |= RF_tainted;
4715 folder = foldEQ_locale;
4716 fold_array = PL_fold_locale;
4717 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4720 case REFFA: /* /\1/iaa */
4721 folder = foldEQ_latin1;
4722 fold_array = PL_fold_latin1;
4723 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4726 case REFFU: /* /\1/iu */
4727 folder = foldEQ_latin1;
4728 fold_array = PL_fold_latin1;
4729 utf8_fold_flags = 0;
4732 case REFF: /* /\1/i */
4734 fold_array = PL_fold;
4735 utf8_fold_flags = 0;
4738 case REF: /* /\1/ */
4741 utf8_fold_flags = 0;
4745 n = ARG(scan); /* which paren pair */
4748 ln = rex->offs[n].start;
4749 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4750 if (rex->lastparen < n || ln == -1)
4751 sayNO; /* Do not match unless seen CLOSEn. */
4752 if (ln == rex->offs[n].end)
4756 if (type != REF /* REF can do byte comparison */
4757 && (utf8_target || type == REFFU))
4758 { /* XXX handle REFFL better */
4759 char * limit = PL_regeol;
4761 /* This call case insensitively compares the entire buffer
4762 * at s, with the current input starting at locinput, but
4763 * not going off the end given by PL_regeol, and returns in
4764 * <limit> upon success, how much of the current input was
4766 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4767 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4775 /* Not utf8: Inline the first character, for speed. */
4776 if (!NEXTCHR_IS_EOS &&
4777 UCHARAT(s) != nextchr &&
4779 UCHARAT(s) != fold_array[nextchr]))
4781 ln = rex->offs[n].end - ln;
4782 if (locinput + ln > PL_regeol)
4784 if (ln > 1 && (type == REF
4785 ? memNE(s, locinput, ln)
4786 : ! folder(s, locinput, ln)))
4792 case NOTHING: /* null op; e.g. the 'nothing' following
4793 * the '*' in m{(a+|b)*}' */
4795 case TAIL: /* placeholder while compiling (A|B|C) */
4798 case BACK: /* ??? doesn't appear to be used ??? */
4802 #define ST st->u.eval
4807 regexp_internal *rei;
4808 regnode *startpoint;
4810 case GOSTART: /* (?R) */
4811 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4812 if (cur_eval && cur_eval->locinput==locinput) {
4813 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4814 Perl_croak(aTHX_ "Infinite recursion in regex");
4815 if ( ++nochange_depth > max_nochange_depth )
4817 "Pattern subroutine nesting without pos change"
4818 " exceeded limit in regex");
4825 if (OP(scan)==GOSUB) {
4826 startpoint = scan + ARG2L(scan);
4827 ST.close_paren = ARG(scan);
4829 startpoint = rei->program+1;
4832 goto eval_recurse_doit;
4833 assert(0); /* NOTREACHED */
4835 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4836 if (cur_eval && cur_eval->locinput==locinput) {
4837 if ( ++nochange_depth > max_nochange_depth )
4838 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4843 /* execute the code in the {...} */
4847 OP * const oop = PL_op;
4848 COP * const ocurcop = PL_curcop;
4850 char *saved_regeol = PL_regeol;
4851 struct re_save_state saved_state;
4854 /* save *all* paren positions */
4855 regcppush(rex, 0, maxopenparen);
4856 REGCP_SET(runops_cp);
4858 /* To not corrupt the existing regex state while executing the
4859 * eval we would normally put it on the save stack, like with
4860 * save_re_context. However, re-evals have a weird scoping so we
4861 * can't just add ENTER/LEAVE here. With that, things like
4863 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4865 * would break, as they expect the localisation to be unwound
4866 * only when the re-engine backtracks through the bit that
4869 * What we do instead is just saving the state in a local c
4872 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4874 PL_reg_state.re_reparsing = FALSE;
4877 caller_cv = find_runcv(NULL);
4881 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4883 (REGEXP*)(rexi->data->data[n])
4886 nop = (OP*)rexi->data->data[n+1];
4888 else if (rexi->data->what[n] == 'l') { /* literal code */
4890 nop = (OP*)rexi->data->data[n];
4891 assert(CvDEPTH(newcv));
4894 /* literal with own CV */
4895 assert(rexi->data->what[n] == 'L');
4896 newcv = rex->qr_anoncv;
4897 nop = (OP*)rexi->data->data[n];
4900 /* normally if we're about to execute code from the same
4901 * CV that we used previously, we just use the existing
4902 * CX stack entry. However, its possible that in the
4903 * meantime we may have backtracked, popped from the save
4904 * stack, and undone the SAVECOMPPAD(s) associated with
4905 * PUSH_MULTICALL; in which case PL_comppad no longer
4906 * points to newcv's pad. */
4907 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4909 I32 depth = (newcv == caller_cv) ? 0 : 1;
4910 if (last_pushed_cv) {
4911 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4914 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4916 last_pushed_cv = newcv;
4919 /* these assignments are just to silence compiler
4921 multicall_cop = NULL;
4924 last_pad = PL_comppad;
4926 /* the initial nextstate you would normally execute
4927 * at the start of an eval (which would cause error
4928 * messages to come from the eval), may be optimised
4929 * away from the execution path in the regex code blocks;
4930 * so manually set PL_curcop to it initially */
4932 OP *o = cUNOPx(nop)->op_first;
4933 assert(o->op_type == OP_NULL);
4934 if (o->op_targ == OP_SCOPE) {
4935 o = cUNOPo->op_first;
4938 assert(o->op_targ == OP_LEAVE);
4939 o = cUNOPo->op_first;
4940 assert(o->op_type == OP_ENTER);
4944 if (o->op_type != OP_STUB) {
4945 assert( o->op_type == OP_NEXTSTATE
4946 || o->op_type == OP_DBSTATE
4947 || (o->op_type == OP_NULL
4948 && ( o->op_targ == OP_NEXTSTATE
4949 || o->op_targ == OP_DBSTATE
4953 PL_curcop = (COP*)o;
4958 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4959 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4961 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4964 SV *sv_mrk = get_sv("REGMARK", 1);
4965 sv_setsv(sv_mrk, sv_yes_mark);
4968 /* we don't use MULTICALL here as we want to call the
4969 * first op of the block of interest, rather than the
4970 * first op of the sub */
4971 before = (IV)(SP-PL_stack_base);
4973 CALLRUNOPS(aTHX); /* Scalar context. */
4975 if ((IV)(SP-PL_stack_base) == before)
4976 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4982 /* before restoring everything, evaluate the returned
4983 * value, so that 'uninit' warnings don't use the wrong
4984 * PL_op or pad. Also need to process any magic vars
4985 * (e.g. $1) *before* parentheses are restored */
4990 if (logical == 0) /* (?{})/ */
4991 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4992 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4993 sw = cBOOL(SvTRUE(ret));
4996 else { /* /(??{}) */
4997 /* if its overloaded, let the regex compiler handle
4998 * it; otherwise extract regex, or stringify */
4999 if (!SvAMAGIC(ret)) {
5003 if (SvTYPE(sv) == SVt_REGEXP)
5004 re_sv = (REGEXP*) sv;
5005 else if (SvSMAGICAL(sv)) {
5006 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5008 re_sv = (REGEXP *) mg->mg_obj;
5011 /* force any magic, undef warnings here */
5013 ret = sv_mortalcopy(ret);
5014 (void) SvPV_force_nolen(ret);
5020 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5022 /* *** Note that at this point we don't restore
5023 * PL_comppad, (or pop the CxSUB) on the assumption it may
5024 * be used again soon. This is safe as long as nothing
5025 * in the regexp code uses the pad ! */
5027 PL_curcop = ocurcop;
5028 PL_regeol = saved_regeol;
5029 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5035 /* only /(??{})/ from now on */
5038 /* extract RE object from returned value; compiling if
5042 re_sv = reg_temp_copy(NULL, re_sv);
5047 if (SvUTF8(ret) && IN_BYTES) {
5048 /* In use 'bytes': make a copy of the octet
5049 * sequence, but without the flag on */
5051 const char *const p = SvPV(ret, len);
5052 ret = newSVpvn_flags(p, len, SVs_TEMP);
5054 if (rex->intflags & PREGf_USE_RE_EVAL)
5055 pm_flags |= PMf_USE_RE_EVAL;
5057 /* if we got here, it should be an engine which
5058 * supports compiling code blocks and stuff */
5059 assert(rex->engine && rex->engine->op_comp);
5060 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5061 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5062 rex->engine, NULL, NULL,
5063 /* copy /msix etc to inner pattern */
5068 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5070 /* This isn't a first class regexp. Instead, it's
5071 caching a regexp onto an existing, Perl visible
5073 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5075 /* safe to do now that any $1 etc has been
5076 * interpolated into the new pattern string and
5078 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5083 RXp_MATCH_COPIED_off(re);
5084 re->subbeg = rex->subbeg;
5085 re->sublen = rex->sublen;
5086 re->suboffset = rex->suboffset;
5087 re->subcoffset = rex->subcoffset;
5090 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
5091 "Matching embedded");
5093 startpoint = rei->program + 1;
5094 ST.close_paren = 0; /* only used for GOSUB */
5096 eval_recurse_doit: /* Share code with GOSUB below this line */
5097 /* run the pattern returned from (??{...}) */
5099 /* Save *all* the positions. */
5100 ST.cp = regcppush(rex, 0, maxopenparen);
5101 REGCP_SET(ST.lastcp);
5104 re->lastcloseparen = 0;
5108 /* XXXX This is too dramatic a measure... */
5111 ST.toggle_reg_flags = PL_reg_flags;
5113 PL_reg_flags |= RF_utf8;
5115 PL_reg_flags &= ~RF_utf8;
5116 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5118 ST.prev_rex = rex_sv;
5119 ST.prev_curlyx = cur_curlyx;
5121 SET_reg_curpm(rex_sv);
5126 ST.prev_eval = cur_eval;
5128 /* now continue from first node in postoned RE */
5129 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5130 assert(0); /* NOTREACHED */
5133 case EVAL_AB: /* cleanup after a successful (??{A})B */
5134 /* note: this is called twice; first after popping B, then A */
5135 PL_reg_flags ^= ST.toggle_reg_flags;
5136 rex_sv = ST.prev_rex;
5137 SET_reg_curpm(rex_sv);
5138 rex = ReANY(rex_sv);
5139 rexi = RXi_GET(rex);
5141 cur_eval = ST.prev_eval;
5142 cur_curlyx = ST.prev_curlyx;
5144 /* XXXX This is too dramatic a measure... */
5146 if ( nochange_depth )
5151 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5152 /* note: this is called twice; first after popping B, then A */
5153 PL_reg_flags ^= ST.toggle_reg_flags;
5154 rex_sv = ST.prev_rex;
5155 SET_reg_curpm(rex_sv);
5156 rex = ReANY(rex_sv);
5157 rexi = RXi_GET(rex);
5159 REGCP_UNWIND(ST.lastcp);
5160 regcppop(rex, &maxopenparen);
5161 cur_eval = ST.prev_eval;
5162 cur_curlyx = ST.prev_curlyx;
5163 /* XXXX This is too dramatic a measure... */
5165 if ( nochange_depth )
5171 n = ARG(scan); /* which paren pair */
5172 rex->offs[n].start_tmp = locinput - PL_bostr;
5173 if (n > maxopenparen)
5175 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5176 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5180 (IV)rex->offs[n].start_tmp,
5186 /* XXX really need to log other places start/end are set too */
5187 #define CLOSE_CAPTURE \
5188 rex->offs[n].start = rex->offs[n].start_tmp; \
5189 rex->offs[n].end = locinput - PL_bostr; \
5190 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5191 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5193 PTR2UV(rex->offs), \
5195 (IV)rex->offs[n].start, \
5196 (IV)rex->offs[n].end \
5200 n = ARG(scan); /* which paren pair */
5202 if (n > rex->lastparen)
5204 rex->lastcloseparen = n;
5205 if (cur_eval && cur_eval->u.eval.close_paren == n) {
5210 case ACCEPT: /* (*ACCEPT) */
5214 cursor && OP(cursor)!=END;
5215 cursor=regnext(cursor))
5217 if ( OP(cursor)==CLOSE ){
5219 if ( n <= lastopen ) {
5221 if (n > rex->lastparen)
5223 rex->lastcloseparen = n;
5224 if ( n == ARG(scan) || (cur_eval &&
5225 cur_eval->u.eval.close_paren == n))
5234 case GROUPP: /* (?(1)) */
5235 n = ARG(scan); /* which paren pair */
5236 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5239 case NGROUPP: /* (?(<name>)) */
5240 /* reg_check_named_buff_matched returns 0 for no match */
5241 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5244 case INSUBP: /* (?(R)) */
5246 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5249 case DEFINEP: /* (?(DEFINE)) */
5253 case IFTHEN: /* (?(cond)A|B) */
5254 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5256 next = NEXTOPER(NEXTOPER(scan));
5258 next = scan + ARG(scan);
5259 if (OP(next) == IFTHEN) /* Fake one. */
5260 next = NEXTOPER(NEXTOPER(next));
5264 case LOGICAL: /* modifier for EVAL and IFMATCH */
5265 logical = scan->flags;
5268 /*******************************************************************
5270 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5271 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5272 STAR/PLUS/CURLY/CURLYN are used instead.)
5274 A*B is compiled as <CURLYX><A><WHILEM><B>
5276 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5277 state, which contains the current count, initialised to -1. It also sets
5278 cur_curlyx to point to this state, with any previous value saved in the
5281 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5282 since the pattern may possibly match zero times (i.e. it's a while {} loop
5283 rather than a do {} while loop).
5285 Each entry to WHILEM represents a successful match of A. The count in the
5286 CURLYX block is incremented, another WHILEM state is pushed, and execution
5287 passes to A or B depending on greediness and the current count.
5289 For example, if matching against the string a1a2a3b (where the aN are
5290 substrings that match /A/), then the match progresses as follows: (the
5291 pushed states are interspersed with the bits of strings matched so far):
5294 <CURLYX cnt=0><WHILEM>
5295 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5296 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5297 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5298 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5300 (Contrast this with something like CURLYM, which maintains only a single
5304 a1 <CURLYM cnt=1> a2
5305 a1 a2 <CURLYM cnt=2> a3
5306 a1 a2 a3 <CURLYM cnt=3> b
5309 Each WHILEM state block marks a point to backtrack to upon partial failure
5310 of A or B, and also contains some minor state data related to that
5311 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5312 overall state, such as the count, and pointers to the A and B ops.
5314 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5315 must always point to the *current* CURLYX block, the rules are:
5317 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5318 and set cur_curlyx to point the new block.
5320 When popping the CURLYX block after a successful or unsuccessful match,
5321 restore the previous cur_curlyx.
5323 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5324 to the outer one saved in the CURLYX block.
5326 When popping the WHILEM block after a successful or unsuccessful B match,
5327 restore the previous cur_curlyx.
5329 Here's an example for the pattern (AI* BI)*BO
5330 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5333 curlyx backtrack stack
5334 ------ ---------------
5336 CO <CO prev=NULL> <WO>
5337 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5338 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5339 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5341 At this point the pattern succeeds, and we work back down the stack to
5342 clean up, restoring as we go:
5344 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5345 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5346 CO <CO prev=NULL> <WO>
5349 *******************************************************************/
5351 #define ST st->u.curlyx
5353 case CURLYX: /* start of /A*B/ (for complex A) */
5355 /* No need to save/restore up to this paren */
5356 I32 parenfloor = scan->flags;
5358 assert(next); /* keep Coverity happy */
5359 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5362 /* XXXX Probably it is better to teach regpush to support
5363 parenfloor > maxopenparen ... */
5364 if (parenfloor > (I32)rex->lastparen)
5365 parenfloor = rex->lastparen; /* Pessimization... */
5367 ST.prev_curlyx= cur_curlyx;
5369 ST.cp = PL_savestack_ix;
5371 /* these fields contain the state of the current curly.
5372 * they are accessed by subsequent WHILEMs */
5373 ST.parenfloor = parenfloor;
5378 ST.count = -1; /* this will be updated by WHILEM */
5379 ST.lastloc = NULL; /* this will be updated by WHILEM */
5381 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5382 assert(0); /* NOTREACHED */
5385 case CURLYX_end: /* just finished matching all of A*B */
5386 cur_curlyx = ST.prev_curlyx;
5388 assert(0); /* NOTREACHED */
5390 case CURLYX_end_fail: /* just failed to match all of A*B */
5392 cur_curlyx = ST.prev_curlyx;
5394 assert(0); /* NOTREACHED */
5398 #define ST st->u.whilem
5400 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5402 /* see the discussion above about CURLYX/WHILEM */
5404 int min = ARG1(cur_curlyx->u.curlyx.me);
5405 int max = ARG2(cur_curlyx->u.curlyx.me);
5406 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5408 assert(cur_curlyx); /* keep Coverity happy */
5409 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5410 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5411 ST.cache_offset = 0;
5415 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5416 "%*s whilem: matched %ld out of %d..%d\n",
5417 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5420 /* First just match a string of min A's. */
5423 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5425 cur_curlyx->u.curlyx.lastloc = locinput;
5426 REGCP_SET(ST.lastcp);
5428 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5429 assert(0); /* NOTREACHED */
5432 /* If degenerate A matches "", assume A done. */
5434 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5435 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5436 "%*s whilem: empty match detected, trying continuation...\n",
5437 REPORT_CODE_OFF+depth*2, "")
5439 goto do_whilem_B_max;
5442 /* super-linear cache processing */
5446 if (!PL_reg_maxiter) {
5447 /* start the countdown: Postpone detection until we
5448 * know the match is not *that* much linear. */
5449 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5450 /* possible overflow for long strings and many CURLYX's */
5451 if (PL_reg_maxiter < 0)
5452 PL_reg_maxiter = I32_MAX;
5453 PL_reg_leftiter = PL_reg_maxiter;
5456 if (PL_reg_leftiter-- == 0) {
5457 /* initialise cache */
5458 const I32 size = (PL_reg_maxiter + 7)/8;
5459 if (PL_reg_poscache) {
5460 if ((I32)PL_reg_poscache_size < size) {
5461 Renew(PL_reg_poscache, size, char);
5462 PL_reg_poscache_size = size;
5464 Zero(PL_reg_poscache, size, char);
5467 PL_reg_poscache_size = size;
5468 Newxz(PL_reg_poscache, size, char);
5470 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5471 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5472 PL_colors[4], PL_colors[5])
5476 if (PL_reg_leftiter < 0) {
5477 /* have we already failed at this position? */
5479 offset = (scan->flags & 0xf) - 1
5480 + (locinput - PL_bostr) * (scan->flags>>4);
5481 mask = 1 << (offset % 8);
5483 if (PL_reg_poscache[offset] & mask) {
5484 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5485 "%*s whilem: (cache) already tried at this position...\n",
5486 REPORT_CODE_OFF+depth*2, "")
5488 sayNO; /* cache records failure */
5490 ST.cache_offset = offset;
5491 ST.cache_mask = mask;
5495 /* Prefer B over A for minimal matching. */
5497 if (cur_curlyx->u.curlyx.minmod) {
5498 ST.save_curlyx = cur_curlyx;
5499 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5500 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5502 REGCP_SET(ST.lastcp);
5503 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5505 assert(0); /* NOTREACHED */
5508 /* Prefer A over B for maximal matching. */
5510 if (n < max) { /* More greed allowed? */
5511 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5513 cur_curlyx->u.curlyx.lastloc = locinput;
5514 REGCP_SET(ST.lastcp);
5515 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5516 assert(0); /* NOTREACHED */
5518 goto do_whilem_B_max;
5520 assert(0); /* NOTREACHED */
5522 case WHILEM_B_min: /* just matched B in a minimal match */
5523 case WHILEM_B_max: /* just matched B in a maximal match */
5524 cur_curlyx = ST.save_curlyx;
5526 assert(0); /* NOTREACHED */
5528 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5529 cur_curlyx = ST.save_curlyx;
5530 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5531 cur_curlyx->u.curlyx.count--;
5533 assert(0); /* NOTREACHED */
5535 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5537 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5538 REGCP_UNWIND(ST.lastcp);
5539 regcppop(rex, &maxopenparen);
5540 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5541 cur_curlyx->u.curlyx.count--;
5543 assert(0); /* NOTREACHED */
5545 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5546 REGCP_UNWIND(ST.lastcp);
5547 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5548 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5549 "%*s whilem: failed, trying continuation...\n",
5550 REPORT_CODE_OFF+depth*2, "")
5553 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5554 && ckWARN(WARN_REGEXP)
5555 && !(PL_reg_flags & RF_warned))
5557 PL_reg_flags |= RF_warned;
5558 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5559 "Complex regular subexpression recursion limit (%d) "
5565 ST.save_curlyx = cur_curlyx;
5566 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5567 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5569 assert(0); /* NOTREACHED */
5571 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5572 cur_curlyx = ST.save_curlyx;
5573 REGCP_UNWIND(ST.lastcp);
5574 regcppop(rex, &maxopenparen);
5576 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5577 /* Maximum greed exceeded */
5578 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5579 && ckWARN(WARN_REGEXP)
5580 && !(PL_reg_flags & RF_warned))
5582 PL_reg_flags |= RF_warned;
5583 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5584 "Complex regular subexpression recursion "
5585 "limit (%d) exceeded",
5588 cur_curlyx->u.curlyx.count--;
5592 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5593 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5595 /* Try grabbing another A and see if it helps. */
5596 cur_curlyx->u.curlyx.lastloc = locinput;
5597 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5599 REGCP_SET(ST.lastcp);
5600 PUSH_STATE_GOTO(WHILEM_A_min,
5601 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5603 assert(0); /* NOTREACHED */
5606 #define ST st->u.branch
5608 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5609 next = scan + ARG(scan);
5612 scan = NEXTOPER(scan);
5615 case BRANCH: /* /(...|A|...)/ */
5616 scan = NEXTOPER(scan); /* scan now points to inner node */
5617 ST.lastparen = rex->lastparen;
5618 ST.lastcloseparen = rex->lastcloseparen;
5619 ST.next_branch = next;
5622 /* Now go into the branch */
5624 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5626 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5628 assert(0); /* NOTREACHED */
5630 case CUTGROUP: /* /(*THEN)/ */
5631 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5632 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5633 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5634 assert(0); /* NOTREACHED */
5636 case CUTGROUP_next_fail:
5639 if (st->u.mark.mark_name)
5640 sv_commit = st->u.mark.mark_name;
5642 assert(0); /* NOTREACHED */
5646 assert(0); /* NOTREACHED */
5648 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5653 REGCP_UNWIND(ST.cp);
5654 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5655 scan = ST.next_branch;
5656 /* no more branches? */
5657 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5659 PerlIO_printf( Perl_debug_log,
5660 "%*s %sBRANCH failed...%s\n",
5661 REPORT_CODE_OFF+depth*2, "",
5667 continue; /* execute next BRANCH[J] op */
5668 assert(0); /* NOTREACHED */
5670 case MINMOD: /* next op will be non-greedy, e.g. A*? */
5675 #define ST st->u.curlym
5677 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5679 /* This is an optimisation of CURLYX that enables us to push
5680 * only a single backtracking state, no matter how many matches
5681 * there are in {m,n}. It relies on the pattern being constant
5682 * length, with no parens to influence future backrefs
5686 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5688 ST.lastparen = rex->lastparen;
5689 ST.lastcloseparen = rex->lastcloseparen;
5691 /* if paren positive, emulate an OPEN/CLOSE around A */
5693 U32 paren = ST.me->flags;
5694 if (paren > maxopenparen)
5695 maxopenparen = paren;
5696 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5704 ST.c1 = CHRTEST_UNINIT;
5707 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5710 curlym_do_A: /* execute the A in /A{m,n}B/ */
5711 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5712 assert(0); /* NOTREACHED */
5714 case CURLYM_A: /* we've just matched an A */
5716 /* after first match, determine A's length: u.curlym.alen */
5717 if (ST.count == 1) {
5718 if (PL_reg_match_utf8) {
5719 char *s = st->locinput;
5720 while (s < locinput) {
5726 ST.alen = locinput - st->locinput;
5729 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5732 PerlIO_printf(Perl_debug_log,
5733 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5734 (int)(REPORT_CODE_OFF+(depth*2)), "",
5735 (IV) ST.count, (IV)ST.alen)
5738 if (cur_eval && cur_eval->u.eval.close_paren &&
5739 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5743 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5744 if ( max == REG_INFTY || ST.count < max )
5745 goto curlym_do_A; /* try to match another A */
5747 goto curlym_do_B; /* try to match B */
5749 case CURLYM_A_fail: /* just failed to match an A */
5750 REGCP_UNWIND(ST.cp);
5752 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5753 || (cur_eval && cur_eval->u.eval.close_paren &&
5754 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5757 curlym_do_B: /* execute the B in /A{m,n}B/ */
5758 if (ST.c1 == CHRTEST_UNINIT) {
5759 /* calculate c1 and c2 for possible match of 1st char
5760 * following curly */
5761 ST.c1 = ST.c2 = CHRTEST_VOID;
5762 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5763 regnode *text_node = ST.B;
5764 if (! HAS_TEXT(text_node))
5765 FIND_NEXT_IMPT(text_node);
5768 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5770 But the former is redundant in light of the latter.
5772 if this changes back then the macro for
5773 IS_TEXT and friends need to change.
5775 if (PL_regkind[OP(text_node)] == EXACT) {
5776 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5777 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5786 PerlIO_printf(Perl_debug_log,
5787 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5788 (int)(REPORT_CODE_OFF+(depth*2)),
5791 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5792 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5793 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5794 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5796 /* simulate B failing */
5798 PerlIO_printf(Perl_debug_log,
5799 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5800 (int)(REPORT_CODE_OFF+(depth*2)),"",
5801 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5802 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5803 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5805 state_num = CURLYM_B_fail;
5806 goto reenter_switch;
5809 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5810 /* simulate B failing */
5812 PerlIO_printf(Perl_debug_log,
5813 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5814 (int)(REPORT_CODE_OFF+(depth*2)),"",
5815 (int) nextchr, ST.c1, ST.c2)
5817 state_num = CURLYM_B_fail;
5818 goto reenter_switch;
5823 /* emulate CLOSE: mark current A as captured */
5824 I32 paren = ST.me->flags;
5826 rex->offs[paren].start
5827 = HOPc(locinput, -ST.alen) - PL_bostr;
5828 rex->offs[paren].end = locinput - PL_bostr;
5829 if ((U32)paren > rex->lastparen)
5830 rex->lastparen = paren;
5831 rex->lastcloseparen = paren;
5834 rex->offs[paren].end = -1;
5835 if (cur_eval && cur_eval->u.eval.close_paren &&
5836 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5845 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5846 assert(0); /* NOTREACHED */
5848 case CURLYM_B_fail: /* just failed to match a B */
5849 REGCP_UNWIND(ST.cp);
5850 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5852 I32 max = ARG2(ST.me);
5853 if (max != REG_INFTY && ST.count == max)
5855 goto curlym_do_A; /* try to match a further A */
5857 /* backtrack one A */
5858 if (ST.count == ARG1(ST.me) /* min */)
5861 SET_locinput(HOPc(locinput, -ST.alen));
5862 goto curlym_do_B; /* try to match B */
5865 #define ST st->u.curly
5867 #define CURLY_SETPAREN(paren, success) \
5870 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5871 rex->offs[paren].end = locinput - PL_bostr; \
5872 if (paren > rex->lastparen) \
5873 rex->lastparen = paren; \
5874 rex->lastcloseparen = paren; \
5877 rex->offs[paren].end = -1; \
5878 rex->lastparen = ST.lastparen; \
5879 rex->lastcloseparen = ST.lastcloseparen; \
5883 case STAR: /* /A*B/ where A is width 1 char */
5887 scan = NEXTOPER(scan);
5890 case PLUS: /* /A+B/ where A is width 1 char */
5894 scan = NEXTOPER(scan);
5897 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5898 ST.paren = scan->flags; /* Which paren to set */
5899 ST.lastparen = rex->lastparen;
5900 ST.lastcloseparen = rex->lastcloseparen;
5901 if (ST.paren > maxopenparen)
5902 maxopenparen = ST.paren;
5903 ST.min = ARG1(scan); /* min to match */
5904 ST.max = ARG2(scan); /* max to match */
5905 if (cur_eval && cur_eval->u.eval.close_paren &&
5906 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5910 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5913 case CURLY: /* /A{m,n}B/ where A is width 1 char */
5915 ST.min = ARG1(scan); /* min to match */
5916 ST.max = ARG2(scan); /* max to match */
5917 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5920 * Lookahead to avoid useless match attempts
5921 * when we know what character comes next.
5923 * Used to only do .*x and .*?x, but now it allows
5924 * for )'s, ('s and (?{ ... })'s to be in the way
5925 * of the quantifier and the EXACT-like node. -- japhy
5928 assert(ST.min <= ST.max);
5929 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5930 ST.c1 = ST.c2 = CHRTEST_VOID;
5933 regnode *text_node = next;
5935 if (! HAS_TEXT(text_node))
5936 FIND_NEXT_IMPT(text_node);
5938 if (! HAS_TEXT(text_node))
5939 ST.c1 = ST.c2 = CHRTEST_VOID;
5941 if ( PL_regkind[OP(text_node)] != EXACT ) {
5942 ST.c1 = ST.c2 = CHRTEST_VOID;
5946 /* Currently we only get here when
5948 PL_rekind[OP(text_node)] == EXACT
5950 if this changes back then the macro for IS_TEXT and
5951 friends need to change. */
5952 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5953 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5964 char *li = locinput;
5966 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5971 if (ST.c1 == CHRTEST_VOID)
5972 goto curly_try_B_min;
5974 ST.oldloc = locinput;
5976 /* set ST.maxpos to the furthest point along the
5977 * string that could possibly match */
5978 if (ST.max == REG_INFTY) {
5979 ST.maxpos = PL_regeol - 1;
5981 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5984 else if (utf8_target) {
5985 int m = ST.max - ST.min;
5986 for (ST.maxpos = locinput;
5987 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5988 ST.maxpos += UTF8SKIP(ST.maxpos);
5991 ST.maxpos = locinput + ST.max - ST.min;
5992 if (ST.maxpos >= PL_regeol)
5993 ST.maxpos = PL_regeol - 1;
5995 goto curly_try_B_min_known;
5999 /* avoid taking address of locinput, so it can remain
6001 char *li = locinput;
6002 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
6003 if (ST.count < ST.min)
6006 if ((ST.count > ST.min)
6007 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6009 /* A{m,n} must come at the end of the string, there's
6010 * no point in backing off ... */
6012 /* ...except that $ and \Z can match before *and* after
6013 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6014 We may back off by one in this case. */
6015 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6019 goto curly_try_B_max;
6021 assert(0); /* NOTREACHED */
6024 case CURLY_B_min_known_fail:
6025 /* failed to find B in a non-greedy match where c1,c2 valid */
6027 REGCP_UNWIND(ST.cp);
6029 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6031 /* Couldn't or didn't -- move forward. */
6032 ST.oldloc = locinput;
6034 locinput += UTF8SKIP(locinput);
6038 curly_try_B_min_known:
6039 /* find the next place where 'B' could work, then call B */
6043 n = (ST.oldloc == locinput) ? 0 : 1;
6044 if (ST.c1 == ST.c2) {
6045 /* set n to utf8_distance(oldloc, locinput) */
6046 while (locinput <= ST.maxpos
6047 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6049 locinput += UTF8SKIP(locinput);
6054 /* set n to utf8_distance(oldloc, locinput) */
6055 while (locinput <= ST.maxpos
6056 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6057 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6059 locinput += UTF8SKIP(locinput);
6064 else { /* Not utf8_target */
6065 if (ST.c1 == ST.c2) {
6066 while (locinput <= ST.maxpos &&
6067 UCHARAT(locinput) != ST.c1)
6071 while (locinput <= ST.maxpos
6072 && UCHARAT(locinput) != ST.c1
6073 && UCHARAT(locinput) != ST.c2)
6076 n = locinput - ST.oldloc;
6078 if (locinput > ST.maxpos)
6081 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6082 * at b; check that everything between oldloc and
6083 * locinput matches */
6084 char *li = ST.oldloc;
6086 if (regrepeat(rex, &li, ST.A, n, depth) < n)
6088 assert(n == REG_INFTY || locinput == li);
6090 CURLY_SETPAREN(ST.paren, ST.count);
6091 if (cur_eval && cur_eval->u.eval.close_paren &&
6092 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6095 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6097 assert(0); /* NOTREACHED */
6100 case CURLY_B_min_fail:
6101 /* failed to find B in a non-greedy match where c1,c2 invalid */
6103 REGCP_UNWIND(ST.cp);
6105 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6107 /* failed -- move forward one */
6109 char *li = locinput;
6110 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
6117 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6118 ST.count > 0)) /* count overflow ? */
6121 CURLY_SETPAREN(ST.paren, ST.count);
6122 if (cur_eval && cur_eval->u.eval.close_paren &&
6123 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6126 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6130 assert(0); /* NOTREACHED */
6134 /* a successful greedy match: now try to match B */
6135 if (cur_eval && cur_eval->u.eval.close_paren &&
6136 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6140 bool could_match = locinput < PL_regeol;
6142 /* If it could work, try it. */
6143 if (ST.c1 != CHRTEST_VOID && could_match) {
6144 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6146 could_match = memEQ(locinput,
6151 UTF8SKIP(locinput));
6154 could_match = UCHARAT(locinput) == ST.c1
6155 || UCHARAT(locinput) == ST.c2;
6158 if (ST.c1 == CHRTEST_VOID || could_match) {
6159 CURLY_SETPAREN(ST.paren, ST.count);
6160 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6161 assert(0); /* NOTREACHED */
6166 case CURLY_B_max_fail:
6167 /* failed to find B in a greedy match */
6169 REGCP_UNWIND(ST.cp);
6171 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6174 if (--ST.count < ST.min)
6176 locinput = HOPc(locinput, -1);
6177 goto curly_try_B_max;
6181 case END: /* last op of main pattern */
6184 /* we've just finished A in /(??{A})B/; now continue with B */
6185 st->u.eval.toggle_reg_flags
6186 = cur_eval->u.eval.toggle_reg_flags;
6187 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6189 st->u.eval.prev_rex = rex_sv; /* inner */
6191 /* Save *all* the positions. */
6192 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6193 rex_sv = cur_eval->u.eval.prev_rex;
6194 SET_reg_curpm(rex_sv);
6195 rex = ReANY(rex_sv);
6196 rexi = RXi_GET(rex);
6197 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6199 REGCP_SET(st->u.eval.lastcp);
6201 /* Restore parens of the outer rex without popping the
6203 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6206 st->u.eval.prev_eval = cur_eval;
6207 cur_eval = cur_eval->u.eval.prev_eval;
6209 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6210 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6211 if ( nochange_depth )
6214 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6215 locinput); /* match B */
6218 if (locinput < reginfo->till) {
6219 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6220 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6222 (long)(locinput - PL_reg_starttry),
6223 (long)(reginfo->till - PL_reg_starttry),
6226 sayNO_SILENT; /* Cannot match: too short. */
6228 sayYES; /* Success! */
6230 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6232 PerlIO_printf(Perl_debug_log,
6233 "%*s %ssubpattern success...%s\n",
6234 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6235 sayYES; /* Success! */
6238 #define ST st->u.ifmatch
6243 case SUSPEND: /* (?>A) */
6245 newstart = locinput;
6248 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6250 goto ifmatch_trivial_fail_test;
6252 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6254 ifmatch_trivial_fail_test:
6256 char * const s = HOPBACKc(locinput, scan->flags);
6261 sw = 1 - cBOOL(ST.wanted);
6265 next = scan + ARG(scan);
6273 newstart = locinput;
6277 ST.logical = logical;
6278 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6280 /* execute body of (?...A) */
6281 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6282 assert(0); /* NOTREACHED */
6285 case IFMATCH_A_fail: /* body of (?...A) failed */
6286 ST.wanted = !ST.wanted;
6289 case IFMATCH_A: /* body of (?...A) succeeded */
6291 sw = cBOOL(ST.wanted);
6293 else if (!ST.wanted)
6296 if (OP(ST.me) != SUSPEND) {
6297 /* restore old position except for (?>...) */
6298 locinput = st->locinput;
6300 scan = ST.me + ARG(ST.me);
6303 continue; /* execute B */
6307 case LONGJMP: /* alternative with many branches compiles to
6308 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6309 next = scan + ARG(scan);
6314 case COMMIT: /* (*COMMIT) */
6315 reginfo->cutpoint = PL_regeol;
6318 case PRUNE: /* (*PRUNE) */
6320 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6321 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6322 assert(0); /* NOTREACHED */
6324 case COMMIT_next_fail:
6328 case OPFAIL: /* (*FAIL) */
6330 assert(0); /* NOTREACHED */
6332 #define ST st->u.mark
6333 case MARKPOINT: /* (*MARK:foo) */
6334 ST.prev_mark = mark_state;
6335 ST.mark_name = sv_commit = sv_yes_mark
6336 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6338 ST.mark_loc = locinput;
6339 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6340 assert(0); /* NOTREACHED */
6342 case MARKPOINT_next:
6343 mark_state = ST.prev_mark;
6345 assert(0); /* NOTREACHED */
6347 case MARKPOINT_next_fail:
6348 if (popmark && sv_eq(ST.mark_name,popmark))
6350 if (ST.mark_loc > startpoint)
6351 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6352 popmark = NULL; /* we found our mark */
6353 sv_commit = ST.mark_name;
6356 PerlIO_printf(Perl_debug_log,
6357 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6358 REPORT_CODE_OFF+depth*2, "",
6359 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6362 mark_state = ST.prev_mark;
6363 sv_yes_mark = mark_state ?
6364 mark_state->u.mark.mark_name : NULL;
6366 assert(0); /* NOTREACHED */
6368 case SKIP: /* (*SKIP) */
6370 /* (*SKIP) : if we fail we cut here*/
6371 ST.mark_name = NULL;
6372 ST.mark_loc = locinput;
6373 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6375 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6376 otherwise do nothing. Meaning we need to scan
6378 regmatch_state *cur = mark_state;
6379 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6382 if ( sv_eq( cur->u.mark.mark_name,
6385 ST.mark_name = find;
6386 PUSH_STATE_GOTO( SKIP_next, next, locinput);
6388 cur = cur->u.mark.prev_mark;
6391 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6394 case SKIP_next_fail:
6396 /* (*CUT:NAME) - Set up to search for the name as we
6397 collapse the stack*/
6398 popmark = ST.mark_name;
6400 /* (*CUT) - No name, we cut here.*/
6401 if (ST.mark_loc > startpoint)
6402 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6403 /* but we set sv_commit to latest mark_name if there
6404 is one so they can test to see how things lead to this
6407 sv_commit=mark_state->u.mark.mark_name;
6411 assert(0); /* NOTREACHED */
6414 case LNBREAK: /* \R */
6415 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6421 #define CASE_CLASS(nAmE) \
6423 if (NEXTCHR_IS_EOS) \
6425 if ((n=is_##nAmE(locinput,utf8_target))) { \
6431 if (NEXTCHR_IS_EOS) \
6433 if ((n=is_##nAmE(locinput,utf8_target))) { \
6436 locinput += UTF8SKIP(locinput); \
6440 CASE_CLASS(VERTWS); /* \v \V */
6441 CASE_CLASS(HORIZWS); /* \h \H */
6445 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6446 PTR2UV(scan), OP(scan));
6447 Perl_croak(aTHX_ "regexp memory corruption");
6449 /* this is a point to jump to in order to increment
6450 * locinput by one character */
6452 assert(!NEXTCHR_IS_EOS);
6454 locinput += PL_utf8skip[nextchr];
6455 /* locinput is allowed to go 1 char off the end, but not 2+ */
6456 if (locinput > PL_regeol)
6465 /* switch break jumps here */
6466 scan = next; /* prepare to execute the next op and ... */
6467 continue; /* ... jump back to the top, reusing st */
6468 assert(0); /* NOTREACHED */
6471 /* push a state that backtracks on success */
6472 st->u.yes.prev_yes_state = yes_state;
6476 /* push a new regex state, then continue at scan */
6478 regmatch_state *newst;
6481 regmatch_state *cur = st;
6482 regmatch_state *curyes = yes_state;
6484 regmatch_slab *slab = PL_regmatch_slab;
6485 for (;curd > -1;cur--,curd--) {
6486 if (cur < SLAB_FIRST(slab)) {
6488 cur = SLAB_LAST(slab);
6490 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6491 REPORT_CODE_OFF + 2 + depth * 2,"",
6492 curd, PL_reg_name[cur->resume_state],
6493 (curyes == cur) ? "yes" : ""
6496 curyes = cur->u.yes.prev_yes_state;
6499 DEBUG_STATE_pp("push")
6502 st->locinput = locinput;
6504 if (newst > SLAB_LAST(PL_regmatch_slab))
6505 newst = S_push_slab(aTHX);
6506 PL_regmatch_state = newst;
6508 locinput = pushinput;
6511 assert(0); /* NOTREACHED */
6516 * We get here only if there's trouble -- normally "case END" is
6517 * the terminating point.
6519 Perl_croak(aTHX_ "corrupted regexp pointers");
6525 /* we have successfully completed a subexpression, but we must now
6526 * pop to the state marked by yes_state and continue from there */
6527 assert(st != yes_state);
6529 while (st != yes_state) {
6531 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6532 PL_regmatch_slab = PL_regmatch_slab->prev;
6533 st = SLAB_LAST(PL_regmatch_slab);
6537 DEBUG_STATE_pp("pop (no final)");
6539 DEBUG_STATE_pp("pop (yes)");
6545 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6546 || yes_state > SLAB_LAST(PL_regmatch_slab))
6548 /* not in this slab, pop slab */
6549 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6550 PL_regmatch_slab = PL_regmatch_slab->prev;
6551 st = SLAB_LAST(PL_regmatch_slab);
6553 depth -= (st - yes_state);
6556 yes_state = st->u.yes.prev_yes_state;
6557 PL_regmatch_state = st;
6560 locinput= st->locinput;
6561 state_num = st->resume_state + no_final;
6562 goto reenter_switch;
6565 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6566 PL_colors[4], PL_colors[5]));
6568 if (PL_reg_state.re_state_eval_setup_done) {
6569 /* each successfully executed (?{...}) block does the equivalent of
6570 * local $^R = do {...}
6571 * When popping the save stack, all these locals would be undone;
6572 * bypass this by setting the outermost saved $^R to the latest
6574 if (oreplsv != GvSV(PL_replgv))
6575 sv_setsv(oreplsv, GvSV(PL_replgv));
6582 PerlIO_printf(Perl_debug_log,
6583 "%*s %sfailed...%s\n",
6584 REPORT_CODE_OFF+depth*2, "",
6585 PL_colors[4], PL_colors[5])
6597 /* there's a previous state to backtrack to */
6599 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6600 PL_regmatch_slab = PL_regmatch_slab->prev;
6601 st = SLAB_LAST(PL_regmatch_slab);
6603 PL_regmatch_state = st;
6604 locinput= st->locinput;
6606 DEBUG_STATE_pp("pop");
6608 if (yes_state == st)
6609 yes_state = st->u.yes.prev_yes_state;
6611 state_num = st->resume_state + 1; /* failure = success + 1 */
6612 goto reenter_switch;
6617 if (rex->intflags & PREGf_VERBARG_SEEN) {
6618 SV *sv_err = get_sv("REGERROR", 1);
6619 SV *sv_mrk = get_sv("REGMARK", 1);
6621 sv_commit = &PL_sv_no;
6623 sv_yes_mark = &PL_sv_yes;
6626 sv_commit = &PL_sv_yes;
6627 sv_yes_mark = &PL_sv_no;
6629 sv_setsv(sv_err, sv_commit);
6630 sv_setsv(sv_mrk, sv_yes_mark);
6634 if (last_pushed_cv) {
6637 PERL_UNUSED_VAR(SP);
6640 /* clean up; in particular, free all slabs above current one */
6641 LEAVE_SCOPE(oldsave);
6643 assert(!result || locinput - PL_bostr >= 0);
6644 return result ? locinput - PL_bostr : -1;
6648 - regrepeat - repeatedly match something simple, report how many
6650 * What 'simple' means is a node which can be the operand of a quantifier like
6653 * startposp - pointer a pointer to the start position. This is updated
6654 * to point to the byte following the highest successful
6656 * p - the regnode to be repeatedly matched against.
6657 * max - maximum number of things to match.
6658 * depth - (for debugging) backtracking depth.
6661 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6664 char *scan; /* Pointer to current position in target string */
6666 char *loceol = PL_regeol; /* local version */
6667 I32 hardcount = 0; /* How many matches so far */
6668 bool utf8_target = PL_reg_match_utf8;
6671 PERL_UNUSED_ARG(depth);
6674 PERL_ARGS_ASSERT_REGREPEAT;
6677 if (max == REG_INFTY)
6679 else if (! utf8_target && scan + max < loceol)
6680 loceol = scan + max;
6682 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6683 * to the maximum of how far we should go in it (leaving it set to the real
6684 * end, if the maximum permissible would take us beyond that). This allows
6685 * us to make the loop exit condition that we haven't gone past <loceol> to
6686 * also mean that we haven't exceeded the max permissible count, saving a
6687 * test each time through the loop. But it assumes that the OP matches a
6688 * single byte, which is true for most of the OPs below when applied to a
6689 * non-UTF-8 target. Those relatively few OPs that don't have this
6690 * characteristic will have to compensate.
6692 * There is no adjustment for UTF-8 targets, as the number of bytes per
6693 * character varies. OPs will have to test both that the count is less
6694 * than the max permissible (using <hardcount> to keep track), and that we
6695 * are still within the bounds of the string (using <loceol>. A few OPs
6696 * match a single byte no matter what the encoding. They can omit the max
6697 * test if, for the UTF-8 case, they do the adjustment that was skipped
6700 * Thus, the code above sets things up for the common case; and exceptional
6701 * cases need extra work; the common case is to make sure <scan> doesn't
6702 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6703 * count doesn't exceed the maximum permissible */
6708 while (scan < loceol && hardcount < max && *scan != '\n') {
6709 scan += UTF8SKIP(scan);
6713 while (scan < loceol && *scan != '\n')
6719 while (scan < loceol && hardcount < max) {
6720 scan += UTF8SKIP(scan);
6727 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6728 if (utf8_target && scan + max < loceol) {
6730 /* <loceol> hadn't been adjusted in the UTF-8 case */
6738 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6742 /* Can use a simple loop if the pattern char to match on is invariant
6743 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6744 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6745 * true iff it doesn't matter if the argument is in UTF-8 or not */
6746 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6747 if (utf8_target && scan + max < loceol) {
6748 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6749 * since here, to match at all, 1 char == 1 byte */
6750 loceol = scan + max;
6752 while (scan < loceol && UCHARAT(scan) == c) {
6756 else if (UTF_PATTERN) {
6758 STRLEN scan_char_len;
6760 /* When both target and pattern are UTF-8, we have to do
6762 while (hardcount < max
6763 && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6764 && scan_char_len <= STR_LEN(p)
6765 && memEQ(scan, STRING(p), scan_char_len))
6767 scan += scan_char_len;
6771 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6773 /* Target isn't utf8; convert the character in the UTF-8
6774 * pattern to non-UTF8, and do a simple loop */
6775 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6776 while (scan < loceol && UCHARAT(scan) == c) {
6779 } /* else pattern char is above Latin1, can't possibly match the
6784 /* Here, the string must be utf8; pattern isn't, and <c> is
6785 * different in utf8 than not, so can't compare them directly.
6786 * Outside the loop, find the two utf8 bytes that represent c, and
6787 * then look for those in sequence in the utf8 string */
6788 U8 high = UTF8_TWO_BYTE_HI(c);
6789 U8 low = UTF8_TWO_BYTE_LO(c);
6791 while (hardcount < max
6792 && scan + 1 < loceol
6793 && UCHARAT(scan) == high
6794 && UCHARAT(scan + 1) == low)
6803 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6807 PL_reg_flags |= RF_tainted;
6808 utf8_flags = FOLDEQ_UTF8_LOCALE;
6816 case EXACTFU_TRICKYFOLD:
6818 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6822 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6824 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6826 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6827 if (c1 == CHRTEST_VOID) {
6828 /* Use full Unicode fold matching */
6829 char *tmpeol = PL_regeol;
6830 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6831 while (hardcount < max
6832 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6833 STRING(p), NULL, pat_len,
6834 cBOOL(UTF_PATTERN), utf8_flags))
6841 else if (utf8_target) {
6843 while (scan < loceol
6845 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6847 scan += UTF8SKIP(scan);
6852 while (scan < loceol
6854 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6855 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6857 scan += UTF8SKIP(scan);
6862 else if (c1 == c2) {
6863 while (scan < loceol && UCHARAT(scan) == c1) {
6868 while (scan < loceol &&
6869 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6880 while (hardcount < max
6881 && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
6882 && reginclass(prog, p, (U8*)scan, utf8_target))
6888 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6895 LOAD_UTF8_CHARCLASS_ALNUM();
6896 while (hardcount < max && scan < loceol &&
6897 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6899 scan += UTF8SKIP(scan);
6903 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6911 while (scan < loceol && isALNUM((U8) *scan)) {
6916 if (utf8_target && scan + max < loceol) {
6918 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6919 * since here, to match, 1 char == 1 byte */
6920 loceol = scan + max;
6922 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6927 PL_reg_flags |= RF_tainted;
6929 while (hardcount < max && scan < loceol &&
6930 isALNUM_LC_utf8((U8*)scan)) {
6931 scan += UTF8SKIP(scan);
6935 while (scan < loceol && isALNUM_LC(*scan))
6944 LOAD_UTF8_CHARCLASS_ALNUM();
6945 while (hardcount < max && scan < loceol &&
6946 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6948 scan += UTF8SKIP(scan);
6952 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6959 goto utf8_Nwordchar;
6960 while (scan < loceol && ! isALNUM((U8) *scan)) {
6966 if (utf8_target && scan + max < loceol) {
6968 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6969 * since here, to match, 1 char == 1 byte */
6970 loceol = scan + max;
6972 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6978 while (scan < loceol && hardcount < max
6979 && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6981 scan += UTF8SKIP(scan);
6986 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6993 while (scan < loceol && hardcount < max
6994 && ! isWORDCHAR_A((U8) *scan))
6996 scan += UTF8SKIP(scan);
7001 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
7007 PL_reg_flags |= RF_tainted;
7009 while (hardcount < max && scan < loceol &&
7010 !isALNUM_LC_utf8((U8*)scan)) {
7011 scan += UTF8SKIP(scan);
7015 while (scan < loceol && !isALNUM_LC(*scan))
7024 while (hardcount < max && scan < loceol
7025 && is_XPERLSPACE_utf8((U8*)scan))
7027 scan += UTF8SKIP(scan);
7033 while (scan < loceol && isSPACE_L1((U8) *scan)) {
7042 while (scan < loceol && isSPACE((U8) *scan)) {
7047 if (utf8_target && scan + max < loceol) {
7049 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7050 * since here, to match, 1 char == 1 byte */
7051 loceol = scan + max;
7053 while (scan < loceol && isSPACE_A((U8) *scan)) {
7058 PL_reg_flags |= RF_tainted;
7060 while (hardcount < max && scan < loceol &&
7061 isSPACE_LC_utf8((U8*)scan)) {
7062 scan += UTF8SKIP(scan);
7066 while (scan < loceol && isSPACE_LC(*scan))
7075 while (hardcount < max && scan < loceol
7076 && ! is_XPERLSPACE_utf8((U8*)scan))
7078 scan += UTF8SKIP(scan);
7084 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
7093 while (scan < loceol && ! isSPACE((U8) *scan)) {
7099 while (hardcount < max && scan < loceol
7100 && ! isSPACE_A((U8) *scan))
7102 scan += UTF8SKIP(scan);
7107 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
7113 PL_reg_flags |= RF_tainted;
7115 while (hardcount < max && scan < loceol &&
7116 !isSPACE_LC_utf8((U8*)scan)) {
7117 scan += UTF8SKIP(scan);
7121 while (scan < loceol && !isSPACE_LC(*scan))
7127 LOAD_UTF8_CHARCLASS_DIGIT();
7128 while (hardcount < max && scan < loceol &&
7129 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7130 scan += UTF8SKIP(scan);
7134 while (scan < loceol && isDIGIT(*scan))
7139 if (utf8_target && scan + max < loceol) {
7141 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7142 * since here, to match, 1 char == 1 byte */
7143 loceol = scan + max;
7145 while (scan < loceol && isDIGIT_A((U8) *scan)) {
7150 PL_reg_flags |= RF_tainted;
7152 while (hardcount < max && scan < loceol &&
7153 isDIGIT_LC_utf8((U8*)scan)) {
7154 scan += UTF8SKIP(scan);
7158 while (scan < loceol && isDIGIT_LC(*scan))
7164 LOAD_UTF8_CHARCLASS_DIGIT();
7165 while (hardcount < max && scan < loceol &&
7166 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7167 scan += UTF8SKIP(scan);
7171 while (scan < loceol && !isDIGIT(*scan))
7177 while (hardcount < max && scan < loceol
7178 && ! isDIGIT_A((U8) *scan)) {
7179 scan += UTF8SKIP(scan);
7184 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7190 PL_reg_flags |= RF_tainted;
7192 while (hardcount < max && scan < loceol &&
7193 !isDIGIT_LC_utf8((U8*)scan)) {
7194 scan += UTF8SKIP(scan);
7198 while (scan < loceol && !isDIGIT_LC(*scan))
7204 while (hardcount < max && scan < loceol &&
7205 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7210 /* LNBREAK can match one or two latin chars, which is ok, but we
7211 * have to use hardcount in this situation, and throw away the
7212 * adjustment to <loceol> done before the switch statement */
7214 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7222 while (hardcount < max && scan < loceol &&
7223 (c=is_HORIZWS_utf8_safe(scan, loceol)))
7229 while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7235 while (hardcount < max && scan < loceol &&
7236 !is_HORIZWS_utf8_safe(scan, loceol))
7238 scan += UTF8SKIP(scan);
7242 while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7249 while (hardcount < max && scan < loceol &&
7250 (c=is_VERTWS_utf8_safe(scan, loceol)))
7256 while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7263 while (hardcount < max && scan < loceol &&
7264 !is_VERTWS_utf8_safe(scan, loceol))
7266 scan += UTF8SKIP(scan);
7270 while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7290 /* These are all 0 width, so match right here or not at all. */
7294 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7295 assert(0); /* NOTREACHED */
7302 c = scan - *startposp;
7306 GET_RE_DEBUG_FLAGS_DECL;
7308 SV * const prop = sv_newmortal();
7309 regprop(prog, prop, p);
7310 PerlIO_printf(Perl_debug_log,
7311 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7312 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7320 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7322 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7323 create a copy so that changes the caller makes won't change the shared one.
7324 If <altsvp> is non-null, will return NULL in it, for back-compat.
7327 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7329 PERL_ARGS_ASSERT_REGCLASS_SWASH;
7335 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7340 S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7342 /* Returns the swash for the input 'node' in the regex 'prog'.
7343 * If <doinit> is true, will attempt to create the swash if not already
7345 * If <listsvp> is non-null, will return the swash initialization string in
7347 * Tied intimately to how regcomp.c sets up the data structure */
7354 RXi_GET_DECL(prog,progi);
7355 const struct reg_data * const data = prog ? progi->data : NULL;
7357 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7359 assert(ANYOF_NONBITMAP(node));
7361 if (data && data->count) {
7362 const U32 n = ARG(node);
7364 if (data->what[n] == 's') {
7365 SV * const rv = MUTABLE_SV(data->data[n]);
7366 AV * const av = MUTABLE_AV(SvRV(rv));
7367 SV **const ary = AvARRAY(av);
7368 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7370 si = *ary; /* ary[0] = the string to initialize the swash with */
7372 /* Elements 2 and 3 are either both present or both absent. [2] is
7373 * any inversion list generated at compile time; [3] indicates if
7374 * that inversion list has any user-defined properties in it. */
7375 if (av_len(av) >= 2) {
7378 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7385 /* Element [1] is reserved for the set-up swash. If already there,
7386 * return it; if not, create it and store it there */
7387 if (SvROK(ary[1])) {
7390 else if (si && doinit) {
7392 sw = _core_swash_init("utf8", /* the utf8 package */
7396 0, /* not from tr/// */
7399 (void)av_store(av, 1, sw);
7405 SV* matches_string = newSVpvn("", 0);
7407 /* Use the swash, if any, which has to have incorporated into it all
7409 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7410 && (si && si != &PL_sv_undef))
7413 /* If no swash, use the input initialization string, if available */
7414 sv_catsv(matches_string, si);
7417 /* Add the inversion list to whatever we have. This may have come from
7418 * the swash, or from an input parameter */
7420 sv_catsv(matches_string, _invlist_contents(invlist));
7422 *listsvp = matches_string;
7429 - reginclass - determine if a character falls into a character class
7431 n is the ANYOF regnode
7432 p is the target string
7433 utf8_target tells whether p is in UTF-8.
7435 Returns true if matched; false otherwise.
7437 Note that this can be a synthetic start class, a combination of various
7438 nodes, so things you think might be mutually exclusive, such as locale,
7439 aren't. It can match both locale and non-locale
7444 S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7447 const char flags = ANYOF_FLAGS(n);
7451 PERL_ARGS_ASSERT_REGINCLASS;
7453 /* If c is not already the code point, get it. Note that
7454 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7455 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7457 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7458 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7459 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7460 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7461 * UTF8_ALLOW_FFFF */
7462 if (c_len == (STRLEN)-1)
7463 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7466 /* If this character is potentially in the bitmap, check it */
7468 if (ANYOF_BITMAP_TEST(n, c))
7470 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7476 else if (flags & ANYOF_LOCALE) {
7477 PL_reg_flags |= RF_tainted;
7479 if ((flags & ANYOF_LOC_FOLD)
7480 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7484 else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7486 /* The data structure is arranged so bits 0, 2, 4, ... are set
7487 * if the class includes the Posix character class given by
7488 * bit/2; and 1, 3, 5, ... are set if the class includes the
7489 * complemented Posix class given by int(bit/2). So we loop
7490 * through the bits, each time changing whether we complement
7491 * the result or not. Suppose for the sake of illustration
7492 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7493 * is set, it means there is a match for this ANYOF node if the
7494 * character is in the class given by the expression (0 / 2 = 0
7495 * = \w). If it is in that class, isFOO_lc() will return 1,
7496 * and since 'to_complement' is 0, the result will stay TRUE,
7497 * and we exit the loop. Suppose instead that bit 0 is 0, but
7498 * bit 1 is 1. That means there is a match if the character
7499 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7500 * but will on bit 1. On the second iteration 'to_complement'
7501 * will be 1, so the exclusive or will reverse things, so we
7502 * are testing for \W. On the third iteration, 'to_complement'
7503 * will be 0, and we would be testing for \s; the fourth
7504 * iteration would test for \S, etc. */
7507 int to_complement = 0;
7508 while (count < ANYOF_MAX) {
7509 if (ANYOF_CLASS_TEST(n, count)
7510 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7522 /* If the bitmap didn't (or couldn't) match, and something outside the
7523 * bitmap could match, try that. Locale nodes specify completely the
7524 * behavior of code points in the bit map (otherwise, a utf8 target would
7525 * cause them to be treated as Unicode and not locale), except in
7526 * the very unlikely event when this node is a synthetic start class, which
7527 * could be a combination of locale and non-locale nodes. So allow locale
7528 * to match for the synthetic start class, which will give a false
7529 * positive that will be resolved when the match is done again as not part
7530 * of the synthetic start class */
7532 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7533 match = TRUE; /* Everything above 255 matches */
7535 else if (ANYOF_NONBITMAP(n)
7536 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7539 || (! (flags & ANYOF_LOCALE))
7540 || (flags & ANYOF_IS_SYNTHETIC)))))
7542 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7547 } else { /* Convert to utf8 */
7549 utf8_p = bytes_to_utf8(p, &len);
7552 if (swash_fetch(sw, utf8_p, TRUE)) {
7556 /* If we allocated a string above, free it */
7557 if (! utf8_target) Safefree(utf8_p);
7561 if (UNICODE_IS_SUPER(c)
7562 && (flags & ANYOF_WARN_SUPER)
7563 && ckWARN_d(WARN_NON_UNICODE))
7565 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7566 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7570 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7571 return cBOOL(flags & ANYOF_INVERT) ^ match;
7575 S_reghop3(U8 *s, I32 off, const U8* lim)
7577 /* return the position 'off' UTF-8 characters away from 's', forward if
7578 * 'off' >= 0, backwards if negative. But don't go outside of position
7579 * 'lim', which better be < s if off < 0 */
7583 PERL_ARGS_ASSERT_REGHOP3;
7586 while (off-- && s < lim) {
7587 /* XXX could check well-formedness here */
7592 while (off++ && s > lim) {
7594 if (UTF8_IS_CONTINUED(*s)) {
7595 while (s > lim && UTF8_IS_CONTINUATION(*s))
7598 /* XXX could check well-formedness here */
7605 /* there are a bunch of places where we use two reghop3's that should
7606 be replaced with this routine. but since thats not done yet
7607 we ifdef it out - dmq
7610 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7614 PERL_ARGS_ASSERT_REGHOP4;
7617 while (off-- && s < rlim) {
7618 /* XXX could check well-formedness here */
7623 while (off++ && s > llim) {
7625 if (UTF8_IS_CONTINUED(*s)) {
7626 while (s > llim && UTF8_IS_CONTINUATION(*s))
7629 /* XXX could check well-formedness here */
7637 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7641 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7644 while (off-- && s < lim) {
7645 /* XXX could check well-formedness here */
7652 while (off++ && s > lim) {
7654 if (UTF8_IS_CONTINUED(*s)) {
7655 while (s > lim && UTF8_IS_CONTINUATION(*s))
7658 /* XXX could check well-formedness here */
7667 restore_pos(pTHX_ void *arg)
7670 regexp * const rex = (regexp *)arg;
7671 if (PL_reg_state.re_state_eval_setup_done) {
7672 if (PL_reg_oldsaved) {
7673 rex->subbeg = PL_reg_oldsaved;
7674 rex->sublen = PL_reg_oldsavedlen;
7675 rex->suboffset = PL_reg_oldsavedoffset;
7676 rex->subcoffset = PL_reg_oldsavedcoffset;
7678 rex->saved_copy = PL_nrs;
7680 RXp_MATCH_COPIED_on(rex);
7682 PL_reg_magic->mg_len = PL_reg_oldpos;
7683 PL_reg_state.re_state_eval_setup_done = FALSE;
7684 PL_curpm = PL_reg_oldcurpm;
7689 S_to_utf8_substr(pTHX_ regexp *prog)
7691 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7692 * on the converted value */
7696 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7699 if (prog->substrs->data[i].substr
7700 && !prog->substrs->data[i].utf8_substr) {
7701 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7702 prog->substrs->data[i].utf8_substr = sv;
7703 sv_utf8_upgrade(sv);
7704 if (SvVALID(prog->substrs->data[i].substr)) {
7705 if (SvTAIL(prog->substrs->data[i].substr)) {
7706 /* Trim the trailing \n that fbm_compile added last
7708 SvCUR_set(sv, SvCUR(sv) - 1);
7709 /* Whilst this makes the SV technically "invalid" (as its
7710 buffer is no longer followed by "\0") when fbm_compile()
7711 adds the "\n" back, a "\0" is restored. */
7712 fbm_compile(sv, FBMcf_TAIL);
7716 if (prog->substrs->data[i].substr == prog->check_substr)
7717 prog->check_utf8 = sv;
7723 S_to_byte_substr(pTHX_ regexp *prog)
7725 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7726 * on the converted value; returns FALSE if can't be converted. */
7731 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7734 if (prog->substrs->data[i].utf8_substr
7735 && !prog->substrs->data[i].substr) {
7736 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7737 if (! sv_utf8_downgrade(sv, TRUE)) {
7740 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7741 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7742 /* Trim the trailing \n that fbm_compile added last
7744 SvCUR_set(sv, SvCUR(sv) - 1);
7745 fbm_compile(sv, FBMcf_TAIL);
7749 prog->substrs->data[i].substr = sv;
7750 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7751 prog->check_substr = sv;
7760 * c-indentation-style: bsd
7762 * indent-tabs-mode: nil
7765 * ex: set ts=8 sts=4 sw=4 et: