5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
46 * pregcomp and pregexec -- regsub and regerror are not used in perl
48 * Copyright (c) 1986 by University of Toronto.
49 * Written by Henry Spencer. Not derived from licensed software.
51 * Permission is granted to anyone to use this software for any
52 * purpose on any computer system, and to redistribute it freely,
53 * subject to the following restrictions:
55 * 1. The author is not responsible for the consequences of use of
56 * this software, no matter how awful, even if they arise
59 * 2. The origin of this software must not be misrepresented, either
60 * by explicit claim or by omission.
62 * 3. Altered versions must be plainly marked as such, and must not
63 * be misrepresented as being the original software.
65 **** Alterations to Henry's code are...
67 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
68 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
69 **** by Larry Wall and others
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
82 #ifdef PERL_IN_XSUB_RE
88 #include "inline_invlist.c"
89 #include "unicode_constants.h"
91 #define RF_tainted 1 /* tainted information used? e.g. locale */
92 #define RF_warned 2 /* warned about big count? */
94 #define RF_utf8 8 /* Pattern contains multibyte chars? */
96 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
98 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
101 #define STATIC static
104 /* Valid for non-utf8 strings: avoids the reginclass
105 * call if there are no complications: i.e., if everything matchable is
106 * straight forward in the bitmap */
107 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
108 : ANYOF_BITMAP_TEST(p,*(c)))
114 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
115 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
117 #define HOPc(pos,off) \
118 (char *)(PL_reg_match_utf8 \
119 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
121 #define HOPBACKc(pos, off) \
122 (char*)(PL_reg_match_utf8\
123 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
124 : (pos - off >= PL_bostr) \
128 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
129 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
132 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
133 #define NEXTCHR_IS_EOS (nextchr < 0)
135 #define SET_nextchr \
136 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
138 #define SET_locinput(p) \
143 /* these are unrolled below in the CCC_TRY_XXX defined */
144 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
145 if (!CAT2(PL_utf8_,class)) { \
147 ENTER; save_re_context(); \
148 ok=CAT2(is_utf8_,class)((const U8*)str); \
149 PERL_UNUSED_VAR(ok); \
150 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
151 /* Doesn't do an assert to verify that is correct */
152 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
153 if (!CAT2(PL_utf8_,class)) { \
154 bool throw_away PERL_UNUSED_DECL; \
155 ENTER; save_re_context(); \
156 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
159 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
160 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
161 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
163 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
164 /* No asserts are done for some of these, in case called on a */ \
165 /* Unicode version in which they map to nothing */ \
166 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
167 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
169 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
171 /* The actual code for CCC_TRY, which uses several variables from the routine
172 * it's callable from. It is designed to be the bulk of a case statement.
173 * FUNC is the macro or function to call on non-utf8 targets that indicate if
174 * nextchr matches the class.
175 * UTF8_TEST is the whole test string to use for utf8 targets
176 * LOAD is what to use to test, and if not present to load in the swash for the
178 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
180 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
181 * utf8 and a variant, load the swash if necessary and test using the utf8
182 * test. Advance to the next character if test is ok, otherwise fail; If not
183 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
184 * fails, or advance to the next character */
186 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
187 if (NEXTCHR_IS_EOS) { \
190 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
191 LOAD_UTF8_CHARCLASS(CLASS, STR); \
192 if (POS_OR_NEG (UTF8_TEST)) { \
196 else if (POS_OR_NEG (FUNC(nextchr))) { \
199 goto increment_locinput;
201 /* Handle the non-locale cases for a character class and its complement. It
202 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
203 * This is because that code fails when the test succeeds, so we want to have
204 * the test fail so that the code succeeds. The swash is stored in a
205 * predictable PL_ place */
206 #define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
209 _CCC_TRY_CODE( !, FUNC, \
210 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
211 (U8*)locinput, TRUE)), \
214 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
215 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
216 (U8*)locinput, TRUE)), \
219 /* Generate the case statements for both locale and non-locale character
220 * classes in regmatch for classes that don't have special unicode semantics.
221 * Locales don't use an immediate swash, but an intermediary special locale
222 * function that is called on the pointer to the current place in the input
223 * string. That function will resolve to needing the same swash. One might
224 * think that because we don't know what the locale will match, we shouldn't
225 * check with the swash loading function that it loaded properly; ie, that we
226 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
227 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
229 #define CCC_TRY(NAME, NNAME, FUNC, \
230 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
231 NAMEA, NNAMEA, FUNCA, \
234 PL_reg_flags |= RF_tainted; \
235 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
237 PL_reg_flags |= RF_tainted; \
238 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
241 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
244 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
248 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
251 goto increment_locinput; \
252 /* Generate the non-locale cases */ \
253 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
255 /* This is like CCC_TRY, but has an extra set of parameters for generating case
256 * statements to handle separate Unicode semantics nodes */
257 #define CCC_TRY_U(NAME, NNAME, FUNC, \
258 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
259 NAMEU, NNAMEU, FUNCU, \
260 NAMEA, NNAMEA, FUNCA, \
262 CCC_TRY(NAME, NNAME, FUNC, \
263 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
264 NAMEA, NNAMEA, FUNCA, \
266 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
268 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
270 /* for use after a quantifier and before an EXACT-like node -- japhy */
271 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
273 * NOTE that *nothing* that affects backtracking should be in here, specifically
274 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
275 * node that is in between two EXACT like nodes when ascertaining what the required
276 * "follow" character is. This should probably be moved to regex compile time
277 * although it may be done at run time beause of the REF possibility - more
278 * investigation required. -- demerphq
280 #define JUMPABLE(rn) ( \
282 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
284 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
285 OP(rn) == PLUS || OP(rn) == MINMOD || \
287 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
289 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
291 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
294 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
295 we don't need this definition. */
296 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
297 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
298 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
301 /* ... so we use this as its faster. */
302 #define IS_TEXT(rn) ( OP(rn)==EXACT )
303 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
304 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
305 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
310 Search for mandatory following text node; for lookahead, the text must
311 follow but for lookbehind (rn->flags != 0) we skip to the next step.
313 #define FIND_NEXT_IMPT(rn) STMT_START { \
314 while (JUMPABLE(rn)) { \
315 const OPCODE type = OP(rn); \
316 if (type == SUSPEND || PL_regkind[type] == CURLY) \
317 rn = NEXTOPER(NEXTOPER(rn)); \
318 else if (type == PLUS) \
320 else if (type == IFMATCH) \
321 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
322 else rn += NEXT_OFF(rn); \
327 static void restore_pos(pTHX_ void *arg);
329 #define REGCP_PAREN_ELEMS 3
330 #define REGCP_OTHER_ELEMS 3
331 #define REGCP_FRAME_ELEMS 1
332 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
333 * are needed for the regexp context stack bookkeeping. */
336 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
339 const int retval = PL_savestack_ix;
340 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
341 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
342 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
344 GET_RE_DEBUG_FLAGS_DECL;
346 PERL_ARGS_ASSERT_REGCPPUSH;
348 if (paren_elems_to_push < 0)
349 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
350 paren_elems_to_push);
352 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
353 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
354 " out of range (%lu-%ld)",
355 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
357 SSGROW(total_elems + REGCP_FRAME_ELEMS);
360 if ((int)PL_regsize > (int)parenfloor)
361 PerlIO_printf(Perl_debug_log,
362 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
367 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
368 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
369 SSPUSHINT(rex->offs[p].end);
370 SSPUSHINT(rex->offs[p].start);
371 SSPUSHINT(rex->offs[p].start_tmp);
372 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
373 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
375 (IV)rex->offs[p].start,
376 (IV)rex->offs[p].start_tmp,
380 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
381 SSPUSHINT(PL_regsize);
382 SSPUSHINT(rex->lastparen);
383 SSPUSHINT(rex->lastcloseparen);
384 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
389 /* These are needed since we do not localize EVAL nodes: */
390 #define REGCP_SET(cp) \
392 PerlIO_printf(Perl_debug_log, \
393 " Setting an EVAL scope, savestack=%"IVdf"\n", \
394 (IV)PL_savestack_ix)); \
397 #define REGCP_UNWIND(cp) \
399 if (cp != PL_savestack_ix) \
400 PerlIO_printf(Perl_debug_log, \
401 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
402 (IV)(cp), (IV)PL_savestack_ix)); \
405 #define UNWIND_PAREN(lp, lcp) \
406 for (n = rex->lastparen; n > lp; n--) \
407 rex->offs[n].end = -1; \
408 rex->lastparen = n; \
409 rex->lastcloseparen = lcp;
413 S_regcppop(pTHX_ regexp *rex)
418 GET_RE_DEBUG_FLAGS_DECL;
420 PERL_ARGS_ASSERT_REGCPPOP;
422 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
424 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
425 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
426 rex->lastcloseparen = SSPOPINT;
427 rex->lastparen = SSPOPINT;
428 PL_regsize = SSPOPINT;
430 i -= REGCP_OTHER_ELEMS;
431 /* Now restore the parentheses context. */
433 if (i || rex->lastparen + 1 <= rex->nparens)
434 PerlIO_printf(Perl_debug_log,
435 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
441 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
443 rex->offs[paren].start_tmp = SSPOPINT;
444 rex->offs[paren].start = SSPOPINT;
446 if (paren <= rex->lastparen)
447 rex->offs[paren].end = tmps;
448 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
449 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
451 (IV)rex->offs[paren].start,
452 (IV)rex->offs[paren].start_tmp,
453 (IV)rex->offs[paren].end,
454 (paren > rex->lastparen ? "(skipped)" : ""));
459 /* It would seem that the similar code in regtry()
460 * already takes care of this, and in fact it is in
461 * a better location to since this code can #if 0-ed out
462 * but the code in regtry() is needed or otherwise tests
463 * requiring null fields (pat.t#187 and split.t#{13,14}
464 * (as of patchlevel 7877) will fail. Then again,
465 * this code seems to be necessary or otherwise
466 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
467 * --jhi updated by dapm */
468 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
470 rex->offs[i].start = -1;
471 rex->offs[i].end = -1;
472 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
473 " \\%"UVuf": %s ..-1 undeffing\n",
475 (i > PL_regsize) ? "-1" : " "
481 /* restore the parens and associated vars at savestack position ix,
482 * but without popping the stack */
485 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
487 I32 tmpix = PL_savestack_ix;
488 PL_savestack_ix = ix;
490 PL_savestack_ix = tmpix;
493 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
496 * pregexec and friends
499 #ifndef PERL_IN_XSUB_RE
501 - pregexec - match a regexp against a string
504 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
505 char *strbeg, I32 minend, SV *screamer, U32 nosave)
506 /* stringarg: the point in the string at which to begin matching */
507 /* strend: pointer to null at end of string */
508 /* strbeg: real beginning of string */
509 /* minend: end of match must be >= minend bytes after stringarg. */
510 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
511 * itself is accessed via the pointers above */
512 /* nosave: For optimizations. */
514 PERL_ARGS_ASSERT_PREGEXEC;
517 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
518 nosave ? 0 : REXEC_COPY_STR);
523 * Need to implement the following flags for reg_anch:
525 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
527 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
528 * INTUIT_AUTORITATIVE_ML
529 * INTUIT_ONCE_NOML - Intuit can match in one location only.
532 * Another flag for this function: SECOND_TIME (so that float substrs
533 * with giant delta may be not rechecked).
536 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
538 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
539 Otherwise, only SvCUR(sv) is used to get strbeg. */
541 /* XXXX We assume that strpos is strbeg unless sv. */
543 /* XXXX Some places assume that there is a fixed substring.
544 An update may be needed if optimizer marks as "INTUITable"
545 RExen without fixed substrings. Similarly, it is assumed that
546 lengths of all the strings are no more than minlen, thus they
547 cannot come from lookahead.
548 (Or minlen should take into account lookahead.)
549 NOTE: Some of this comment is not correct. minlen does now take account
550 of lookahead/behind. Further research is required. -- demerphq
554 /* A failure to find a constant substring means that there is no need to make
555 an expensive call to REx engine, thus we celebrate a failure. Similarly,
556 finding a substring too deep into the string means that less calls to
557 regtry() should be needed.
559 REx compiler's optimizer found 4 possible hints:
560 a) Anchored substring;
562 c) Whether we are anchored (beginning-of-line or \G);
563 d) First node (of those at offset 0) which may distinguish positions;
564 We use a)b)d) and multiline-part of c), and try to find a position in the
565 string which does not contradict any of them.
568 /* Most of decisions we do here should have been done at compile time.
569 The nodes of the REx which we used for the search should have been
570 deleted from the finite automaton. */
573 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
574 char *strend, const U32 flags, re_scream_pos_data *data)
577 struct regexp *const prog = (struct regexp *)SvANY(rx);
579 /* Should be nonnegative! */
585 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
587 char *other_last = NULL; /* other substr checked before this */
588 char *check_at = NULL; /* check substr found at this pos */
589 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
590 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
591 RXi_GET_DECL(prog,progi);
593 const char * const i_strpos = strpos;
595 GET_RE_DEBUG_FLAGS_DECL;
597 PERL_ARGS_ASSERT_RE_INTUIT_START;
598 PERL_UNUSED_ARG(flags);
599 PERL_UNUSED_ARG(data);
601 RX_MATCH_UTF8_set(rx,utf8_target);
604 PL_reg_flags |= RF_utf8;
607 debug_start_match(rx, utf8_target, strpos, strend,
608 sv ? "Guessing start of match in sv for"
609 : "Guessing start of match in string for");
612 /* CHR_DIST() would be more correct here but it makes things slow. */
613 if (prog->minlen > strend - strpos) {
614 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
615 "String too short... [re_intuit_start]\n"));
619 /* XXX we need to pass strbeg as a separate arg: the following is
620 * guesswork and can be wrong... */
621 if (sv && SvPOK(sv)) {
622 char * p = SvPVX(sv);
623 STRLEN cur = SvCUR(sv);
624 if (p <= strpos && strpos < p + cur) {
626 assert(p <= strend && strend <= p + cur);
629 strbeg = strend - cur;
636 if (!prog->check_utf8 && prog->check_substr)
637 to_utf8_substr(prog);
638 check = prog->check_utf8;
640 if (!prog->check_substr && prog->check_utf8) {
641 if (! to_byte_substr(prog)) {
642 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
643 non_utf8_target_but_utf8_required));
647 check = prog->check_substr;
649 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
650 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
651 || ( (prog->extflags & RXf_ANCH_BOL)
652 && !multiline ) ); /* Check after \n? */
655 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
656 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
657 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
659 && (strpos != strbeg)) {
660 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
663 if (prog->check_offset_min == prog->check_offset_max &&
664 !(prog->extflags & RXf_CANY_SEEN)) {
665 /* Substring at constant offset from beg-of-str... */
668 s = HOP3c(strpos, prog->check_offset_min, strend);
671 slen = SvCUR(check); /* >= 1 */
673 if ( strend - s > slen || strend - s < slen - 1
674 || (strend - s == slen && strend[-1] != '\n')) {
675 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
678 /* Now should match s[0..slen-2] */
680 if (slen && (*SvPVX_const(check) != *s
682 && memNE(SvPVX_const(check), s, slen)))) {
684 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
688 else if (*SvPVX_const(check) != *s
689 || ((slen = SvCUR(check)) > 1
690 && memNE(SvPVX_const(check), s, slen)))
693 goto success_at_start;
696 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
698 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
699 end_shift = prog->check_end_shift;
702 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
703 - (SvTAIL(check) != 0);
704 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
706 if (end_shift < eshift)
710 else { /* Can match at random position */
713 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
714 end_shift = prog->check_end_shift;
716 /* end shift should be non negative here */
719 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
721 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
722 (IV)end_shift, RX_PRECOMP(prog));
726 /* Find a possible match in the region s..strend by looking for
727 the "check" substring in the region corrected by start/end_shift. */
730 I32 srch_start_shift = start_shift;
731 I32 srch_end_shift = end_shift;
734 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
735 srch_end_shift -= ((strbeg - s) - srch_start_shift);
736 srch_start_shift = strbeg - s;
738 DEBUG_OPTIMISE_MORE_r({
739 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
740 (IV)prog->check_offset_min,
741 (IV)srch_start_shift,
743 (IV)prog->check_end_shift);
746 if (prog->extflags & RXf_CANY_SEEN) {
747 start_point= (U8*)(s + srch_start_shift);
748 end_point= (U8*)(strend - srch_end_shift);
750 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
751 end_point= HOP3(strend, -srch_end_shift, strbeg);
753 DEBUG_OPTIMISE_MORE_r({
754 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
755 (int)(end_point - start_point),
756 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
760 s = fbm_instr( start_point, end_point,
761 check, multiline ? FBMrf_MULTILINE : 0);
763 /* Update the count-of-usability, remove useless subpatterns,
767 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
768 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
769 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
770 (s ? "Found" : "Did not find"),
771 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
772 ? "anchored" : "floating"),
775 (s ? " at offset " : "...\n") );
780 /* Finish the diagnostic message */
781 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
783 /* XXX dmq: first branch is for positive lookbehind...
784 Our check string is offset from the beginning of the pattern.
785 So we need to do any stclass tests offset forward from that
794 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
795 Start with the other substr.
796 XXXX no SCREAM optimization yet - and a very coarse implementation
797 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
798 *always* match. Probably should be marked during compile...
799 Probably it is right to do no SCREAM here...
802 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
803 : (prog->float_substr && prog->anchored_substr))
805 /* Take into account the "other" substring. */
806 /* XXXX May be hopelessly wrong for UTF... */
809 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
812 char * const last = HOP3c(s, -start_shift, strbeg);
814 char * const saved_s = s;
817 t = s - prog->check_offset_max;
818 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
820 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
825 t = HOP3c(t, prog->anchored_offset, strend);
826 if (t < other_last) /* These positions already checked */
828 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
831 /* XXXX It is not documented what units *_offsets are in.
832 We assume bytes, but this is clearly wrong.
833 Meaning this code needs to be carefully reviewed for errors.
837 /* On end-of-str: see comment below. */
838 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
839 if (must == &PL_sv_undef) {
841 DEBUG_r(must = prog->anchored_utf8); /* for debug */
846 HOP3(HOP3(last1, prog->anchored_offset, strend)
847 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
849 multiline ? FBMrf_MULTILINE : 0
852 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
853 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
854 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
855 (s ? "Found" : "Contradicts"),
856 quoted, RE_SV_TAIL(must));
861 if (last1 >= last2) {
862 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
863 ", giving up...\n"));
866 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
867 ", trying floating at offset %ld...\n",
868 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
869 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
870 s = HOP3c(last, 1, strend);
874 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
875 (long)(s - i_strpos)));
876 t = HOP3c(s, -prog->anchored_offset, strbeg);
877 other_last = HOP3c(s, 1, strend);
885 else { /* Take into account the floating substring. */
887 char * const saved_s = s;
890 t = HOP3c(s, -start_shift, strbeg);
892 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
893 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
894 last = HOP3c(t, prog->float_max_offset, strend);
895 s = HOP3c(t, prog->float_min_offset, strend);
898 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
899 must = utf8_target ? prog->float_utf8 : prog->float_substr;
900 /* fbm_instr() takes into account exact value of end-of-str
901 if the check is SvTAIL(ed). Since false positives are OK,
902 and end-of-str is not later than strend we are OK. */
903 if (must == &PL_sv_undef) {
905 DEBUG_r(must = prog->float_utf8); /* for debug message */
908 s = fbm_instr((unsigned char*)s,
909 (unsigned char*)last + SvCUR(must)
911 must, multiline ? FBMrf_MULTILINE : 0);
913 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
914 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
915 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
916 (s ? "Found" : "Contradicts"),
917 quoted, RE_SV_TAIL(must));
921 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
922 ", giving up...\n"));
925 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
926 ", trying anchored starting at offset %ld...\n",
927 (long)(saved_s + 1 - i_strpos)));
929 s = HOP3c(t, 1, strend);
933 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
934 (long)(s - i_strpos)));
935 other_last = s; /* Fix this later. --Hugo */
945 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
947 DEBUG_OPTIMISE_MORE_r(
948 PerlIO_printf(Perl_debug_log,
949 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
950 (IV)prog->check_offset_min,
951 (IV)prog->check_offset_max,
959 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
961 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
964 /* Fixed substring is found far enough so that the match
965 cannot start at strpos. */
967 if (ml_anch && t[-1] != '\n') {
968 /* Eventually fbm_*() should handle this, but often
969 anchored_offset is not 0, so this check will not be wasted. */
970 /* XXXX In the code below we prefer to look for "^" even in
971 presence of anchored substrings. And we search even
972 beyond the found float position. These pessimizations
973 are historical artefacts only. */
975 while (t < strend - prog->minlen) {
977 if (t < check_at - prog->check_offset_min) {
978 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
979 /* Since we moved from the found position,
980 we definitely contradict the found anchored
981 substr. Due to the above check we do not
982 contradict "check" substr.
983 Thus we can arrive here only if check substr
984 is float. Redo checking for "other"=="fixed".
987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
988 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
989 goto do_other_anchored;
991 /* We don't contradict the found floating substring. */
992 /* XXXX Why not check for STCLASS? */
994 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
995 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
998 /* Position contradicts check-string */
999 /* XXXX probably better to look for check-string
1000 than for "\n", so one should lower the limit for t? */
1001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1002 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1003 other_last = strpos = s = t + 1;
1008 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1009 PL_colors[0], PL_colors[1]));
1013 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1014 PL_colors[0], PL_colors[1]));
1018 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1021 /* The found string does not prohibit matching at strpos,
1022 - no optimization of calling REx engine can be performed,
1023 unless it was an MBOL and we are not after MBOL,
1024 or a future STCLASS check will fail this. */
1026 /* Even in this situation we may use MBOL flag if strpos is offset
1027 wrt the start of the string. */
1028 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1029 && (strpos != strbeg) && strpos[-1] != '\n'
1030 /* May be due to an implicit anchor of m{.*foo} */
1031 && !(prog->intflags & PREGf_IMPLICIT))
1036 DEBUG_EXECUTE_r( if (ml_anch)
1037 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1038 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1041 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1043 prog->check_utf8 /* Could be deleted already */
1044 && --BmUSEFUL(prog->check_utf8) < 0
1045 && (prog->check_utf8 == prog->float_utf8)
1047 prog->check_substr /* Could be deleted already */
1048 && --BmUSEFUL(prog->check_substr) < 0
1049 && (prog->check_substr == prog->float_substr)
1052 /* If flags & SOMETHING - do not do it many times on the same match */
1053 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1054 /* XXX Does the destruction order has to change with utf8_target? */
1055 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1056 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1057 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1058 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1059 check = NULL; /* abort */
1061 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1062 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1063 if (prog->intflags & PREGf_IMPLICIT)
1064 prog->extflags &= ~RXf_ANCH_MBOL;
1065 /* XXXX This is a remnant of the old implementation. It
1066 looks wasteful, since now INTUIT can use many
1067 other heuristics. */
1068 prog->extflags &= ~RXf_USE_INTUIT;
1069 /* XXXX What other flags might need to be cleared in this branch? */
1075 /* Last resort... */
1076 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1077 /* trie stclasses are too expensive to use here, we are better off to
1078 leave it to regmatch itself */
1079 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1080 /* minlen == 0 is possible if regstclass is \b or \B,
1081 and the fixed substr is ''$.
1082 Since minlen is already taken into account, s+1 is before strend;
1083 accidentally, minlen >= 1 guaranties no false positives at s + 1
1084 even for \b or \B. But (minlen? 1 : 0) below assumes that
1085 regstclass does not come from lookahead... */
1086 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1087 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1088 const U8* const str = (U8*)STRING(progi->regstclass);
1089 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1090 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1093 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1094 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1095 else if (prog->float_substr || prog->float_utf8)
1096 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1100 if (checked_upto < s)
1102 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1103 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1106 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1111 const char *what = NULL;
1113 if (endpos == strend) {
1114 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1115 "Could not match STCLASS...\n") );
1118 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119 "This position contradicts STCLASS...\n") );
1120 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1122 checked_upto = HOPBACKc(endpos, start_shift);
1123 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1124 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1125 /* Contradict one of substrings */
1126 if (prog->anchored_substr || prog->anchored_utf8) {
1127 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1128 DEBUG_EXECUTE_r( what = "anchored" );
1130 s = HOP3c(t, 1, strend);
1131 if (s + start_shift + end_shift > strend) {
1132 /* XXXX Should be taken into account earlier? */
1133 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1134 "Could not match STCLASS...\n") );
1139 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1140 "Looking for %s substr starting at offset %ld...\n",
1141 what, (long)(s + start_shift - i_strpos)) );
1144 /* Have both, check_string is floating */
1145 if (t + start_shift >= check_at) /* Contradicts floating=check */
1146 goto retry_floating_check;
1147 /* Recheck anchored substring, but not floating... */
1151 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1152 "Looking for anchored substr starting at offset %ld...\n",
1153 (long)(other_last - i_strpos)) );
1154 goto do_other_anchored;
1156 /* Another way we could have checked stclass at the
1157 current position only: */
1162 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1163 "Looking for /%s^%s/m starting at offset %ld...\n",
1164 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1167 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1169 /* Check is floating substring. */
1170 retry_floating_check:
1171 t = check_at - start_shift;
1172 DEBUG_EXECUTE_r( what = "floating" );
1173 goto hop_and_restart;
1176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1177 "By STCLASS: moving %ld --> %ld\n",
1178 (long)(t - i_strpos), (long)(s - i_strpos))
1182 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1183 "Does not contradict STCLASS...\n");
1188 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1189 PL_colors[4], (check ? "Guessed" : "Giving up"),
1190 PL_colors[5], (long)(s - i_strpos)) );
1193 fail_finish: /* Substring not found */
1194 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1195 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1197 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1198 PL_colors[4], PL_colors[5]));
1202 #define DECL_TRIE_TYPE(scan) \
1203 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1204 trie_type = ((scan->flags == EXACT) \
1205 ? (utf8_target ? trie_utf8 : trie_plain) \
1206 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1208 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1209 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1211 switch (trie_type) { \
1212 case trie_utf8_fold: \
1213 if ( foldlen>0 ) { \
1214 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1219 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1220 len = UTF8SKIP(uc); \
1221 skiplen = UNISKIP( uvc ); \
1222 foldlen -= skiplen; \
1223 uscan = foldbuf + skiplen; \
1226 case trie_latin_utf8_fold: \
1227 if ( foldlen>0 ) { \
1228 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1234 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1235 skiplen = UNISKIP( uvc ); \
1236 foldlen -= skiplen; \
1237 uscan = foldbuf + skiplen; \
1241 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1248 charid = trie->charmap[ uvc ]; \
1252 if (widecharmap) { \
1253 SV** const svpp = hv_fetch(widecharmap, \
1254 (char*)&uvc, sizeof(UV), 0); \
1256 charid = (U16)SvIV(*svpp); \
1261 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1265 && (ln == 1 || folder(s, pat_string, ln)) \
1266 && (!reginfo || regtry(reginfo, &s)) ) \
1272 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1274 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
1280 #define REXEC_FBC_SCAN(CoDe) \
1282 while (s < strend) { \
1288 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1289 REXEC_FBC_UTF8_SCAN( \
1291 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1300 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1303 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1312 #define REXEC_FBC_TRYIT \
1313 if ((!reginfo || regtry(reginfo, &s))) \
1316 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1317 if (utf8_target) { \
1318 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1321 REXEC_FBC_CLASS_SCAN(CoNd); \
1324 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1325 if (utf8_target) { \
1327 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1330 REXEC_FBC_CLASS_SCAN(CoNd); \
1333 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1334 PL_reg_flags |= RF_tainted; \
1335 if (utf8_target) { \
1336 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1339 REXEC_FBC_CLASS_SCAN(CoNd); \
1342 #define DUMP_EXEC_POS(li,s,doutf8) \
1343 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1346 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1347 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1348 tmp = TEST_NON_UTF8(tmp); \
1349 REXEC_FBC_UTF8_SCAN( \
1350 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1359 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1360 if (s == PL_bostr) { \
1364 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1365 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1368 LOAD_UTF8_CHARCLASS_ALNUM(); \
1369 REXEC_FBC_UTF8_SCAN( \
1370 if (tmp == ! (TeSt2_UtF8)) { \
1379 /* The only difference between the BOUND and NBOUND cases is that
1380 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1381 * NBOUND. This is accomplished by passing it in either the if or else clause,
1382 * with the other one being empty */
1383 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1384 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1386 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1387 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1389 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1390 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1392 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1393 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1396 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1397 * be passed in completely with the variable name being tested, which isn't
1398 * such a clean interface, but this is easier to read than it was before. We
1399 * are looking for the boundary (or non-boundary between a word and non-word
1400 * character. The utf8 and non-utf8 cases have the same logic, but the details
1401 * must be different. Find the "wordness" of the character just prior to this
1402 * one, and compare it with the wordness of this one. If they differ, we have
1403 * a boundary. At the beginning of the string, pretend that the previous
1404 * character was a new-line */
1405 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1406 if (utf8_target) { \
1409 else { /* Not utf8 */ \
1410 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1411 tmp = TEST_NON_UTF8(tmp); \
1413 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1422 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1425 /* We know what class REx starts with. Try to find this position... */
1426 /* if reginfo is NULL, its a dryrun */
1427 /* annoyingly all the vars in this routine have different names from their counterparts
1428 in regmatch. /grrr */
1431 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1432 const char *strend, regmatch_info *reginfo)
1435 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1436 char *pat_string; /* The pattern's exactish string */
1437 char *pat_end; /* ptr to end char of pat_string */
1438 re_fold_t folder; /* Function for computing non-utf8 folds */
1439 const U8 *fold_array; /* array for folding ords < 256 */
1446 I32 tmp = 1; /* Scratch variable? */
1447 const bool utf8_target = PL_reg_match_utf8;
1448 UV utf8_fold_flags = 0;
1449 RXi_GET_DECL(prog,progi);
1451 PERL_ARGS_ASSERT_FIND_BYCLASS;
1453 /* We know what class it must start with. */
1457 STRLEN inclasslen = strend - s;
1458 REXEC_FBC_UTF8_CLASS_SCAN(
1459 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1462 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1467 if (tmp && (!reginfo || regtry(reginfo, &s)))
1475 if (UTF_PATTERN || utf8_target) {
1476 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1477 goto do_exactf_utf8;
1479 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1480 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1481 goto do_exactf_non_utf8; /* isn't dealt with by these */
1486 /* regcomp.c already folded this if pattern is in UTF-8 */
1487 utf8_fold_flags = 0;
1488 goto do_exactf_utf8;
1490 fold_array = PL_fold;
1492 goto do_exactf_non_utf8;
1495 if (UTF_PATTERN || utf8_target) {
1496 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1497 goto do_exactf_utf8;
1499 fold_array = PL_fold_locale;
1500 folder = foldEQ_locale;
1501 goto do_exactf_non_utf8;
1505 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1507 goto do_exactf_utf8;
1509 case EXACTFU_TRICKYFOLD:
1511 if (UTF_PATTERN || utf8_target) {
1512 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1513 goto do_exactf_utf8;
1516 /* Any 'ss' in the pattern should have been replaced by regcomp,
1517 * so we don't have to worry here about this single special case
1518 * in the Latin1 range */
1519 fold_array = PL_fold_latin1;
1520 folder = foldEQ_latin1;
1524 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1525 are no glitches with fold-length differences
1526 between the target string and pattern */
1528 /* The idea in the non-utf8 EXACTF* cases is to first find the
1529 * first character of the EXACTF* node and then, if necessary,
1530 * case-insensitively compare the full text of the node. c1 is the
1531 * first character. c2 is its fold. This logic will not work for
1532 * Unicode semantics and the german sharp ss, which hence should
1533 * not be compiled into a node that gets here. */
1534 pat_string = STRING(c);
1535 ln = STR_LEN(c); /* length to match in octets/bytes */
1537 /* We know that we have to match at least 'ln' bytes (which is the
1538 * same as characters, since not utf8). If we have to match 3
1539 * characters, and there are only 2 availabe, we know without
1540 * trying that it will fail; so don't start a match past the
1541 * required minimum number from the far end */
1542 e = HOP3c(strend, -((I32)ln), s);
1544 if (!reginfo && e < s) {
1545 e = s; /* Due to minlen logic of intuit() */
1549 c2 = fold_array[c1];
1550 if (c1 == c2) { /* If char and fold are the same */
1551 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1554 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1563 /* If one of the operands is in utf8, we can't use the simpler
1564 * folding above, due to the fact that many different characters
1565 * can have the same fold, or portion of a fold, or different-
1567 pat_string = STRING(c);
1568 ln = STR_LEN(c); /* length to match in octets/bytes */
1569 pat_end = pat_string + ln;
1570 lnc = (UTF_PATTERN) /* length to match in characters */
1571 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1574 /* We have 'lnc' characters to match in the pattern, but because of
1575 * multi-character folding, each character in the target can match
1576 * up to 3 characters (Unicode guarantees it will never exceed
1577 * this) if it is utf8-encoded; and up to 2 if not (based on the
1578 * fact that the Latin 1 folds are already determined, and the
1579 * only multi-char fold in that range is the sharp-s folding to
1580 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1581 * string character. Adjust lnc accordingly, rounding up, so that
1582 * if we need to match at least 4+1/3 chars, that really is 5. */
1583 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1584 lnc = (lnc + expansion - 1) / expansion;
1586 /* As in the non-UTF8 case, if we have to match 3 characters, and
1587 * only 2 are left, it's guaranteed to fail, so don't start a
1588 * match that would require us to go beyond the end of the string
1590 e = HOP3c(strend, -((I32)lnc), s);
1592 if (!reginfo && e < s) {
1593 e = s; /* Due to minlen logic of intuit() */
1596 /* XXX Note that we could recalculate e to stop the loop earlier,
1597 * as the worst case expansion above will rarely be met, and as we
1598 * go along we would usually find that e moves further to the left.
1599 * This would happen only after we reached the point in the loop
1600 * where if there were no expansion we should fail. Unclear if
1601 * worth the expense */
1604 char *my_strend= (char *)strend;
1605 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1606 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1607 && (!reginfo || regtry(reginfo, &s)) )
1611 s += (utf8_target) ? UTF8SKIP(s) : 1;
1616 PL_reg_flags |= RF_tainted;
1617 FBC_BOUND(isALNUM_LC,
1618 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1619 isALNUM_LC_utf8((U8*)s));
1622 PL_reg_flags |= RF_tainted;
1623 FBC_NBOUND(isALNUM_LC,
1624 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625 isALNUM_LC_utf8((U8*)s));
1628 FBC_BOUND(isWORDCHAR,
1630 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1633 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1635 isWORDCHAR_A((U8*)s));
1638 FBC_NBOUND(isWORDCHAR,
1640 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1643 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1645 isWORDCHAR_A((U8*)s));
1648 FBC_BOUND(isWORDCHAR_L1,
1650 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1653 FBC_NBOUND(isWORDCHAR_L1,
1655 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1658 REXEC_FBC_CSCAN_TAINT(
1659 isALNUM_LC_utf8((U8*)s),
1664 REXEC_FBC_CSCAN_PRELOAD(
1665 LOAD_UTF8_CHARCLASS_ALNUM(),
1666 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1667 isWORDCHAR_L1((U8) *s)
1671 REXEC_FBC_CSCAN_PRELOAD(
1672 LOAD_UTF8_CHARCLASS_ALNUM(),
1673 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1678 /* Don't need to worry about utf8, as it can match only a single
1679 * byte invariant character */
1680 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1683 REXEC_FBC_CSCAN_PRELOAD(
1684 LOAD_UTF8_CHARCLASS_ALNUM(),
1685 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1686 ! isWORDCHAR_L1((U8) *s)
1690 REXEC_FBC_CSCAN_PRELOAD(
1691 LOAD_UTF8_CHARCLASS_ALNUM(),
1692 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1703 REXEC_FBC_CSCAN_TAINT(
1704 !isALNUM_LC_utf8((U8*)s),
1709 REXEC_FBC_CSCAN_PRELOAD(
1710 LOAD_UTF8_CHARCLASS_SPACE(),
1711 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1716 REXEC_FBC_CSCAN_PRELOAD(
1717 LOAD_UTF8_CHARCLASS_SPACE(),
1718 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1723 /* Don't need to worry about utf8, as it can match only a single
1724 * byte invariant character */
1725 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1728 REXEC_FBC_CSCAN_TAINT(
1729 isSPACE_LC_utf8((U8*)s),
1734 REXEC_FBC_CSCAN_PRELOAD(
1735 LOAD_UTF8_CHARCLASS_SPACE(),
1736 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1737 ! isSPACE_L1((U8) *s)
1741 REXEC_FBC_CSCAN_PRELOAD(
1742 LOAD_UTF8_CHARCLASS_SPACE(),
1743 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1754 REXEC_FBC_CSCAN_TAINT(
1755 !isSPACE_LC_utf8((U8*)s),
1760 REXEC_FBC_CSCAN_PRELOAD(
1761 LOAD_UTF8_CHARCLASS_DIGIT(),
1762 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1767 /* Don't need to worry about utf8, as it can match only a single
1768 * byte invariant character */
1769 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1772 REXEC_FBC_CSCAN_TAINT(
1773 isDIGIT_LC_utf8((U8*)s),
1778 REXEC_FBC_CSCAN_PRELOAD(
1779 LOAD_UTF8_CHARCLASS_DIGIT(),
1780 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1791 REXEC_FBC_CSCAN_TAINT(
1792 !isDIGIT_LC_utf8((U8*)s),
1798 is_LNBREAK_utf8_safe(s, strend),
1799 is_LNBREAK_latin1_safe(s, strend)
1804 is_VERTWS_utf8_safe(s, strend),
1805 is_VERTWS_latin1_safe(s, strend)
1810 !is_VERTWS_utf8_safe(s, strend),
1811 !is_VERTWS_latin1_safe(s, strend)
1816 is_HORIZWS_utf8_safe(s, strend),
1817 is_HORIZWS_latin1_safe(s, strend)
1822 !is_HORIZWS_utf8_safe(s, strend),
1823 !is_HORIZWS_latin1_safe(s, strend)
1827 /* Don't need to worry about utf8, as it can match only a single
1828 * byte invariant character. The flag in this node type is the
1829 * class number to pass to _generic_isCC() to build a mask for
1830 * searching in PL_charclass[] */
1831 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1835 !_generic_isCC_A(*s, FLAGS(c)),
1836 !_generic_isCC_A(*s, FLAGS(c))
1844 /* what trie are we using right now */
1846 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1848 = (reg_trie_data*)progi->data->data[ aho->trie ];
1849 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1851 const char *last_start = strend - trie->minlen;
1853 const char *real_start = s;
1855 STRLEN maxlen = trie->maxlen;
1857 U8 **points; /* map of where we were in the input string
1858 when reading a given char. For ASCII this
1859 is unnecessary overhead as the relationship
1860 is always 1:1, but for Unicode, especially
1861 case folded Unicode this is not true. */
1862 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1866 GET_RE_DEBUG_FLAGS_DECL;
1868 /* We can't just allocate points here. We need to wrap it in
1869 * an SV so it gets freed properly if there is a croak while
1870 * running the match */
1873 sv_points=newSV(maxlen * sizeof(U8 *));
1874 SvCUR_set(sv_points,
1875 maxlen * sizeof(U8 *));
1876 SvPOK_on(sv_points);
1877 sv_2mortal(sv_points);
1878 points=(U8**)SvPV_nolen(sv_points );
1879 if ( trie_type != trie_utf8_fold
1880 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1883 bitmap=(U8*)trie->bitmap;
1885 bitmap=(U8*)ANYOF_BITMAP(c);
1887 /* this is the Aho-Corasick algorithm modified a touch
1888 to include special handling for long "unknown char"
1889 sequences. The basic idea being that we use AC as long
1890 as we are dealing with a possible matching char, when
1891 we encounter an unknown char (and we have not encountered
1892 an accepting state) we scan forward until we find a legal
1894 AC matching is basically that of trie matching, except
1895 that when we encounter a failing transition, we fall back
1896 to the current states "fail state", and try the current char
1897 again, a process we repeat until we reach the root state,
1898 state 1, or a legal transition. If we fail on the root state
1899 then we can either terminate if we have reached an accepting
1900 state previously, or restart the entire process from the beginning
1904 while (s <= last_start) {
1905 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1913 U8 *uscan = (U8*)NULL;
1914 U8 *leftmost = NULL;
1916 U32 accepted_word= 0;
1920 while ( state && uc <= (U8*)strend ) {
1922 U32 word = aho->states[ state ].wordnum;
1926 DEBUG_TRIE_EXECUTE_r(
1927 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1928 dump_exec_pos( (char *)uc, c, strend, real_start,
1929 (char *)uc, utf8_target );
1930 PerlIO_printf( Perl_debug_log,
1931 " Scanning for legal start char...\n");
1935 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1939 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1945 if (uc >(U8*)last_start) break;
1949 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1950 if (!leftmost || lpos < leftmost) {
1951 DEBUG_r(accepted_word=word);
1957 points[pointpos++ % maxlen]= uc;
1958 if (foldlen || uc < (U8*)strend) {
1959 REXEC_TRIE_READ_CHAR(trie_type, trie,
1961 uscan, len, uvc, charid, foldlen,
1963 DEBUG_TRIE_EXECUTE_r({
1964 dump_exec_pos( (char *)uc, c, strend,
1965 real_start, s, utf8_target);
1966 PerlIO_printf(Perl_debug_log,
1967 " Charid:%3u CP:%4"UVxf" ",
1979 word = aho->states[ state ].wordnum;
1981 base = aho->states[ state ].trans.base;
1983 DEBUG_TRIE_EXECUTE_r({
1985 dump_exec_pos( (char *)uc, c, strend, real_start,
1987 PerlIO_printf( Perl_debug_log,
1988 "%sState: %4"UVxf", word=%"UVxf,
1989 failed ? " Fail transition to " : "",
1990 (UV)state, (UV)word);
1996 ( ((offset = base + charid
1997 - 1 - trie->uniquecharcount)) >= 0)
1998 && ((U32)offset < trie->lasttrans)
1999 && trie->trans[offset].check == state
2000 && (tmp=trie->trans[offset].next))
2002 DEBUG_TRIE_EXECUTE_r(
2003 PerlIO_printf( Perl_debug_log," - legal\n"));
2008 DEBUG_TRIE_EXECUTE_r(
2009 PerlIO_printf( Perl_debug_log," - fail\n"));
2011 state = aho->fail[state];
2015 /* we must be accepting here */
2016 DEBUG_TRIE_EXECUTE_r(
2017 PerlIO_printf( Perl_debug_log," - accepting\n"));
2026 if (!state) state = 1;
2029 if ( aho->states[ state ].wordnum ) {
2030 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2031 if (!leftmost || lpos < leftmost) {
2032 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2037 s = (char*)leftmost;
2038 DEBUG_TRIE_EXECUTE_r({
2040 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2041 (UV)accepted_word, (IV)(s - real_start)
2044 if (!reginfo || regtry(reginfo, &s)) {
2050 DEBUG_TRIE_EXECUTE_r({
2051 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2054 DEBUG_TRIE_EXECUTE_r(
2055 PerlIO_printf( Perl_debug_log,"No match.\n"));
2064 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2074 - regexec_flags - match a regexp against a string
2077 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2078 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2079 /* stringarg: the point in the string at which to begin matching */
2080 /* strend: pointer to null at end of string */
2081 /* strbeg: real beginning of string */
2082 /* minend: end of match must be >= minend bytes after stringarg. */
2083 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2084 * itself is accessed via the pointers above */
2085 /* data: May be used for some additional optimizations.
2086 Currently its only used, with a U32 cast, for transmitting
2087 the ganch offset when doing a /g match. This will change */
2088 /* nosave: For optimizations. */
2092 struct regexp *const prog = (struct regexp *)SvANY(rx);
2093 /*register*/ char *s;
2095 /*register*/ char *startpos = stringarg;
2096 I32 minlen; /* must match at least this many chars */
2097 I32 dontbother = 0; /* how many characters not to try at end */
2098 I32 end_shift = 0; /* Same for the end. */ /* CC */
2099 I32 scream_pos = -1; /* Internal iterator of scream. */
2100 char *scream_olds = NULL;
2101 const bool utf8_target = cBOOL(DO_UTF8(sv));
2103 RXi_GET_DECL(prog,progi);
2104 regmatch_info reginfo; /* create some info to pass to regtry etc */
2105 regexp_paren_pair *swap = NULL;
2106 GET_RE_DEBUG_FLAGS_DECL;
2108 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2109 PERL_UNUSED_ARG(data);
2111 /* Be paranoid... */
2112 if (prog == NULL || startpos == NULL) {
2113 Perl_croak(aTHX_ "NULL regexp parameter");
2117 multiline = prog->extflags & RXf_PMf_MULTILINE;
2118 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2120 RX_MATCH_UTF8_set(rx, utf8_target);
2122 debug_start_match(rx, utf8_target, startpos, strend,
2126 minlen = prog->minlen;
2128 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2129 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2130 "String too short [regexec_flags]...\n"));
2135 /* Check validity of program. */
2136 if (UCHARAT(progi->program) != REG_MAGIC) {
2137 Perl_croak(aTHX_ "corrupted regexp program");
2141 PL_reg_state.re_state_eval_setup_done = FALSE;
2145 PL_reg_flags |= RF_utf8;
2147 /* Mark beginning of line for ^ and lookbehind. */
2148 reginfo.bol = startpos; /* XXX not used ??? */
2152 /* Mark end of line for $ (and such) */
2155 /* see how far we have to get to not match where we matched before */
2156 reginfo.till = startpos+minend;
2158 /* If there is a "must appear" string, look for it. */
2161 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2163 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2164 reginfo.ganch = startpos + prog->gofs;
2165 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2166 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2167 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2169 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2170 && mg->mg_len >= 0) {
2171 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2172 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2173 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2175 if (prog->extflags & RXf_ANCH_GPOS) {
2176 if (s > reginfo.ganch)
2178 s = reginfo.ganch - prog->gofs;
2179 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2180 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2186 reginfo.ganch = strbeg + PTR2UV(data);
2187 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2188 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2190 } else { /* pos() not defined */
2191 reginfo.ganch = strbeg;
2192 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2193 "GPOS: reginfo.ganch = strbeg\n"));
2196 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2197 /* We have to be careful. If the previous successful match
2198 was from this regex we don't want a subsequent partially
2199 successful match to clobber the old results.
2200 So when we detect this possibility we add a swap buffer
2201 to the re, and switch the buffer each match. If we fail
2202 we switch it back, otherwise we leave it swapped.
2205 /* do we need a save destructor here for eval dies? */
2206 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2207 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2208 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2214 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2215 re_scream_pos_data d;
2217 d.scream_olds = &scream_olds;
2218 d.scream_pos = &scream_pos;
2219 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2221 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2222 goto phooey; /* not present */
2228 /* Simplest case: anchored match need be tried only once. */
2229 /* [unless only anchor is BOL and multiline is set] */
2230 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2231 if (s == startpos && regtry(®info, &startpos))
2233 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2234 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2239 dontbother = minlen - 1;
2240 end = HOP3c(strend, -dontbother, strbeg) - 1;
2241 /* for multiline we only have to try after newlines */
2242 if (prog->check_substr || prog->check_utf8) {
2243 /* because of the goto we can not easily reuse the macros for bifurcating the
2244 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2247 goto after_try_utf8;
2249 if (regtry(®info, &s)) {
2256 if (prog->extflags & RXf_USE_INTUIT) {
2257 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2266 } /* end search for check string in unicode */
2268 if (s == startpos) {
2269 goto after_try_latin;
2272 if (regtry(®info, &s)) {
2279 if (prog->extflags & RXf_USE_INTUIT) {
2280 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2289 } /* end search for check string in latin*/
2290 } /* end search for check string */
2291 else { /* search for newline */
2293 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2296 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2297 while (s <= end) { /* note it could be possible to match at the end of the string */
2298 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2299 if (regtry(®info, &s))
2303 } /* end search for newline */
2304 } /* end anchored/multiline check string search */
2306 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2308 /* the warning about reginfo.ganch being used without initialization
2309 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2310 and we only enter this block when the same bit is set. */
2311 char *tmp_s = reginfo.ganch - prog->gofs;
2313 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2318 /* Messy cases: unanchored match. */
2319 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2320 /* we have /x+whatever/ */
2321 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2327 if (! prog->anchored_utf8) {
2328 to_utf8_substr(prog);
2330 ch = SvPVX_const(prog->anchored_utf8)[0];
2333 DEBUG_EXECUTE_r( did_match = 1 );
2334 if (regtry(®info, &s)) goto got_it;
2336 while (s < strend && *s == ch)
2343 if (! prog->anchored_substr) {
2344 if (! to_byte_substr(prog)) {
2345 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2346 non_utf8_target_but_utf8_required));
2350 ch = SvPVX_const(prog->anchored_substr)[0];
2353 DEBUG_EXECUTE_r( did_match = 1 );
2354 if (regtry(®info, &s)) goto got_it;
2356 while (s < strend && *s == ch)
2361 DEBUG_EXECUTE_r(if (!did_match)
2362 PerlIO_printf(Perl_debug_log,
2363 "Did not find anchored character...\n")
2366 else if (prog->anchored_substr != NULL
2367 || prog->anchored_utf8 != NULL
2368 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2369 && prog->float_max_offset < strend - s)) {
2374 char *last1; /* Last position checked before */
2378 if (prog->anchored_substr || prog->anchored_utf8) {
2380 if (! prog->anchored_utf8) {
2381 to_utf8_substr(prog);
2383 must = prog->anchored_utf8;
2386 if (! prog->anchored_substr) {
2387 if (! to_byte_substr(prog)) {
2388 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2389 non_utf8_target_but_utf8_required));
2393 must = prog->anchored_substr;
2395 back_max = back_min = prog->anchored_offset;
2398 if (! prog->float_utf8) {
2399 to_utf8_substr(prog);
2401 must = prog->float_utf8;
2404 if (! prog->float_substr) {
2405 if (! to_byte_substr(prog)) {
2406 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2407 non_utf8_target_but_utf8_required));
2411 must = prog->float_substr;
2413 back_max = prog->float_max_offset;
2414 back_min = prog->float_min_offset;
2420 last = HOP3c(strend, /* Cannot start after this */
2421 -(I32)(CHR_SVLEN(must)
2422 - (SvTAIL(must) != 0) + back_min), strbeg);
2425 last1 = HOPc(s, -1);
2427 last1 = s - 1; /* bogus */
2429 /* XXXX check_substr already used to find "s", can optimize if
2430 check_substr==must. */
2432 dontbother = end_shift;
2433 strend = HOPc(strend, -dontbother);
2434 while ( (s <= last) &&
2435 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2436 (unsigned char*)strend, must,
2437 multiline ? FBMrf_MULTILINE : 0)) ) {
2438 DEBUG_EXECUTE_r( did_match = 1 );
2439 if (HOPc(s, -back_max) > last1) {
2440 last1 = HOPc(s, -back_min);
2441 s = HOPc(s, -back_max);
2444 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2446 last1 = HOPc(s, -back_min);
2450 while (s <= last1) {
2451 if (regtry(®info, &s))
2454 s++; /* to break out of outer loop */
2461 while (s <= last1) {
2462 if (regtry(®info, &s))
2468 DEBUG_EXECUTE_r(if (!did_match) {
2469 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2470 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2471 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2472 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2473 ? "anchored" : "floating"),
2474 quoted, RE_SV_TAIL(must));
2478 else if ( (c = progi->regstclass) ) {
2480 const OPCODE op = OP(progi->regstclass);
2481 /* don't bother with what can't match */
2482 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2483 strend = HOPc(strend, -(minlen - 1));
2486 SV * const prop = sv_newmortal();
2487 regprop(prog, prop, c);
2489 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2491 PerlIO_printf(Perl_debug_log,
2492 "Matching stclass %.*s against %s (%d bytes)\n",
2493 (int)SvCUR(prop), SvPVX_const(prop),
2494 quoted, (int)(strend - s));
2497 if (find_byclass(prog, c, s, strend, ®info))
2499 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2503 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2511 if (! prog->float_utf8) {
2512 to_utf8_substr(prog);
2514 float_real = prog->float_utf8;
2517 if (! prog->float_substr) {
2518 if (! to_byte_substr(prog)) {
2519 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2520 non_utf8_target_but_utf8_required));
2524 float_real = prog->float_substr;
2527 little = SvPV_const(float_real, len);
2528 if (SvTAIL(float_real)) {
2529 /* This means that float_real contains an artificial \n on
2530 * the end due to the presence of something like this:
2531 * /foo$/ where we can match both "foo" and "foo\n" at the
2532 * end of the string. So we have to compare the end of the
2533 * string first against the float_real without the \n and
2534 * then against the full float_real with the string. We
2535 * have to watch out for cases where the string might be
2536 * smaller than the float_real or the float_real without
2538 char *checkpos= strend - len;
2540 PerlIO_printf(Perl_debug_log,
2541 "%sChecking for float_real.%s\n",
2542 PL_colors[4], PL_colors[5]));
2543 if (checkpos + 1 < strbeg) {
2544 /* can't match, even if we remove the trailing \n
2545 * string is too short to match */
2547 PerlIO_printf(Perl_debug_log,
2548 "%sString shorter than required trailing substring, cannot match.%s\n",
2549 PL_colors[4], PL_colors[5]));
2551 } else if (memEQ(checkpos + 1, little, len - 1)) {
2552 /* can match, the end of the string matches without the
2554 last = checkpos + 1;
2555 } else if (checkpos < strbeg) {
2556 /* cant match, string is too short when the "\n" is
2559 PerlIO_printf(Perl_debug_log,
2560 "%sString does not contain required trailing substring, cannot match.%s\n",
2561 PL_colors[4], PL_colors[5]));
2563 } else if (!multiline) {
2564 /* non multiline match, so compare with the "\n" at the
2565 * end of the string */
2566 if (memEQ(checkpos, little, len)) {
2570 PerlIO_printf(Perl_debug_log,
2571 "%sString does not contain required trailing substring, cannot match.%s\n",
2572 PL_colors[4], PL_colors[5]));
2576 /* multiline match, so we have to search for a place
2577 * where the full string is located */
2583 last = rninstr(s, strend, little, little + len);
2585 last = strend; /* matching "$" */
2588 /* at one point this block contained a comment which was
2589 * probably incorrect, which said that this was a "should not
2590 * happen" case. Even if it was true when it was written I am
2591 * pretty sure it is not anymore, so I have removed the comment
2592 * and replaced it with this one. Yves */
2594 PerlIO_printf(Perl_debug_log,
2595 "String does not contain required substring, cannot match.\n"
2599 dontbother = strend - last + prog->float_min_offset;
2601 if (minlen && (dontbother < minlen))
2602 dontbother = minlen - 1;
2603 strend -= dontbother; /* this one's always in bytes! */
2604 /* We don't know much -- general case. */
2607 if (regtry(®info, &s))
2616 if (regtry(®info, &s))
2618 } while (s++ < strend);
2628 PerlIO_printf(Perl_debug_log,
2629 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2635 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2637 if (PL_reg_state.re_state_eval_setup_done)
2638 restore_pos(aTHX_ prog);
2639 if (RXp_PAREN_NAMES(prog))
2640 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2642 /* make sure $`, $&, $', and $digit will work later */
2643 if ( !(flags & REXEC_NOT_FIRST) ) {
2644 if (flags & REXEC_COPY_STR) {
2645 #ifdef PERL_OLD_COPY_ON_WRITE
2647 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2649 PerlIO_printf(Perl_debug_log,
2650 "Copy on write: regexp capture, type %d\n",
2653 RX_MATCH_COPY_FREE(rx);
2654 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2655 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2656 assert (SvPOKp(prog->saved_copy));
2657 prog->sublen = PL_regeol - strbeg;
2658 prog->suboffset = 0;
2659 prog->subcoffset = 0;
2664 I32 max = PL_regeol - strbeg;
2667 if ( (flags & REXEC_COPY_SKIP_POST)
2668 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2669 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2670 ) { /* don't copy $' part of string */
2673 /* calculate the right-most part of the string covered
2674 * by a capture. Due to look-ahead, this may be to
2675 * the right of $&, so we have to scan all captures */
2676 while (n <= prog->lastparen) {
2677 if (prog->offs[n].end > max)
2678 max = prog->offs[n].end;
2682 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2683 ? prog->offs[0].start
2685 assert(max >= 0 && max <= PL_regeol - strbeg);
2688 if ( (flags & REXEC_COPY_SKIP_PRE)
2689 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2690 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2691 ) { /* don't copy $` part of string */
2694 /* calculate the left-most part of the string covered
2695 * by a capture. Due to look-behind, this may be to
2696 * the left of $&, so we have to scan all captures */
2697 while (min && n <= prog->lastparen) {
2698 if ( prog->offs[n].start != -1
2699 && prog->offs[n].start < min)
2701 min = prog->offs[n].start;
2705 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2706 && min > prog->offs[0].end
2708 min = prog->offs[0].end;
2712 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2715 if (RX_MATCH_COPIED(rx)) {
2716 if (sublen > prog->sublen)
2718 (char*)saferealloc(prog->subbeg, sublen+1);
2721 prog->subbeg = (char*)safemalloc(sublen+1);
2722 Copy(strbeg + min, prog->subbeg, sublen, char);
2723 prog->subbeg[sublen] = '\0';
2724 prog->suboffset = min;
2725 prog->sublen = sublen;
2726 RX_MATCH_COPIED_on(rx);
2728 prog->subcoffset = prog->suboffset;
2729 if (prog->suboffset && utf8_target) {
2730 /* Convert byte offset to chars.
2731 * XXX ideally should only compute this if @-/@+
2732 * has been seen, a la PL_sawampersand ??? */
2734 /* If there's a direct correspondence between the
2735 * string which we're matching and the original SV,
2736 * then we can use the utf8 len cache associated with
2737 * the SV. In particular, it means that under //g,
2738 * sv_pos_b2u() will use the previously cached
2739 * position to speed up working out the new length of
2740 * subcoffset, rather than counting from the start of
2741 * the string each time. This stops
2742 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2743 * from going quadratic */
2744 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2745 sv_pos_b2u(sv, &(prog->subcoffset));
2747 prog->subcoffset = utf8_length((U8*)strbeg,
2748 (U8*)(strbeg+prog->suboffset));
2752 RX_MATCH_COPY_FREE(rx);
2753 prog->subbeg = strbeg;
2754 prog->suboffset = 0;
2755 prog->subcoffset = 0;
2756 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2763 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2764 PL_colors[4], PL_colors[5]));
2765 if (PL_reg_state.re_state_eval_setup_done)
2766 restore_pos(aTHX_ prog);
2768 /* we failed :-( roll it back */
2769 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2770 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2775 Safefree(prog->offs);
2782 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2783 * Do inc before dec, in case old and new rex are the same */
2784 #define SET_reg_curpm(Re2) \
2785 if (PL_reg_state.re_state_eval_setup_done) { \
2786 (void)ReREFCNT_inc(Re2); \
2787 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2788 PM_SETRE((PL_reg_curpm), (Re2)); \
2793 - regtry - try match at specific point
2795 STATIC I32 /* 0 failure, 1 success */
2796 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2800 REGEXP *const rx = reginfo->prog;
2801 regexp *const prog = (struct regexp *)SvANY(rx);
2803 RXi_GET_DECL(prog,progi);
2804 GET_RE_DEBUG_FLAGS_DECL;
2806 PERL_ARGS_ASSERT_REGTRY;
2808 reginfo->cutpoint=NULL;
2810 if ((prog->extflags & RXf_EVAL_SEEN)
2811 && !PL_reg_state.re_state_eval_setup_done)
2815 PL_reg_state.re_state_eval_setup_done = TRUE;
2817 /* Make $_ available to executed code. */
2818 if (reginfo->sv != DEFSV) {
2820 DEFSV_set(reginfo->sv);
2823 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2824 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2825 /* prepare for quick setting of pos */
2826 #ifdef PERL_OLD_COPY_ON_WRITE
2827 if (SvIsCOW(reginfo->sv))
2828 sv_force_normal_flags(reginfo->sv, 0);
2830 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2831 &PL_vtbl_mglob, NULL, 0);
2835 PL_reg_oldpos = mg->mg_len;
2836 SAVEDESTRUCTOR_X(restore_pos, prog);
2838 if (!PL_reg_curpm) {
2839 Newxz(PL_reg_curpm, 1, PMOP);
2842 SV* const repointer = &PL_sv_undef;
2843 /* this regexp is also owned by the new PL_reg_curpm, which
2844 will try to free it. */
2845 av_push(PL_regex_padav, repointer);
2846 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2847 PL_regex_pad = AvARRAY(PL_regex_padav);
2852 PL_reg_oldcurpm = PL_curpm;
2853 PL_curpm = PL_reg_curpm;
2854 if (RXp_MATCH_COPIED(prog)) {
2855 /* Here is a serious problem: we cannot rewrite subbeg,
2856 since it may be needed if this match fails. Thus
2857 $` inside (?{}) could fail... */
2858 PL_reg_oldsaved = prog->subbeg;
2859 PL_reg_oldsavedlen = prog->sublen;
2860 PL_reg_oldsavedoffset = prog->suboffset;
2861 PL_reg_oldsavedcoffset = prog->suboffset;
2862 #ifdef PERL_OLD_COPY_ON_WRITE
2863 PL_nrs = prog->saved_copy;
2865 RXp_MATCH_COPIED_off(prog);
2868 PL_reg_oldsaved = NULL;
2869 prog->subbeg = PL_bostr;
2870 prog->suboffset = 0;
2871 prog->subcoffset = 0;
2872 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2875 PL_reg_starttry = *startposp;
2877 prog->offs[0].start = *startposp - PL_bostr;
2878 prog->lastparen = 0;
2879 prog->lastcloseparen = 0;
2882 /* XXXX What this code is doing here?!!! There should be no need
2883 to do this again and again, prog->lastparen should take care of
2886 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2887 * Actually, the code in regcppop() (which Ilya may be meaning by
2888 * prog->lastparen), is not needed at all by the test suite
2889 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2890 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2891 * Meanwhile, this code *is* needed for the
2892 * above-mentioned test suite tests to succeed. The common theme
2893 * on those tests seems to be returning null fields from matches.
2894 * --jhi updated by dapm */
2896 if (prog->nparens) {
2897 regexp_paren_pair *pp = prog->offs;
2899 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2907 result = regmatch(reginfo, *startposp, progi->program + 1);
2909 prog->offs[0].end = result;
2912 if (reginfo->cutpoint)
2913 *startposp= reginfo->cutpoint;
2914 REGCP_UNWIND(lastcp);
2919 #define sayYES goto yes
2920 #define sayNO goto no
2921 #define sayNO_SILENT goto no_silent
2923 /* we dont use STMT_START/END here because it leads to
2924 "unreachable code" warnings, which are bogus, but distracting. */
2925 #define CACHEsayNO \
2926 if (ST.cache_mask) \
2927 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2930 /* this is used to determine how far from the left messages like
2931 'failed...' are printed. It should be set such that messages
2932 are inline with the regop output that created them.
2934 #define REPORT_CODE_OFF 32
2937 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2938 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2939 #define CHRTEST_NOT_A_CP_1 -999
2940 #define CHRTEST_NOT_A_CP_2 -998
2942 #define SLAB_FIRST(s) (&(s)->states[0])
2943 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2945 /* grab a new slab and return the first slot in it */
2947 STATIC regmatch_state *
2950 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2953 regmatch_slab *s = PL_regmatch_slab->next;
2955 Newx(s, 1, regmatch_slab);
2956 s->prev = PL_regmatch_slab;
2958 PL_regmatch_slab->next = s;
2960 PL_regmatch_slab = s;
2961 return SLAB_FIRST(s);
2965 /* push a new state then goto it */
2967 #define PUSH_STATE_GOTO(state, node, input) \
2968 pushinput = input; \
2970 st->resume_state = state; \
2973 /* push a new state with success backtracking, then goto it */
2975 #define PUSH_YES_STATE_GOTO(state, node, input) \
2976 pushinput = input; \
2978 st->resume_state = state; \
2979 goto push_yes_state;
2986 regmatch() - main matching routine
2988 This is basically one big switch statement in a loop. We execute an op,
2989 set 'next' to point the next op, and continue. If we come to a point which
2990 we may need to backtrack to on failure such as (A|B|C), we push a
2991 backtrack state onto the backtrack stack. On failure, we pop the top
2992 state, and re-enter the loop at the state indicated. If there are no more
2993 states to pop, we return failure.
2995 Sometimes we also need to backtrack on success; for example /A+/, where
2996 after successfully matching one A, we need to go back and try to
2997 match another one; similarly for lookahead assertions: if the assertion
2998 completes successfully, we backtrack to the state just before the assertion
2999 and then carry on. In these cases, the pushed state is marked as
3000 'backtrack on success too'. This marking is in fact done by a chain of
3001 pointers, each pointing to the previous 'yes' state. On success, we pop to
3002 the nearest yes state, discarding any intermediate failure-only states.
3003 Sometimes a yes state is pushed just to force some cleanup code to be
3004 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3005 it to free the inner regex.
3007 Note that failure backtracking rewinds the cursor position, while
3008 success backtracking leaves it alone.
3010 A pattern is complete when the END op is executed, while a subpattern
3011 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3012 ops trigger the "pop to last yes state if any, otherwise return true"
3015 A common convention in this function is to use A and B to refer to the two
3016 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3017 the subpattern to be matched possibly multiple times, while B is the entire
3018 rest of the pattern. Variable and state names reflect this convention.
3020 The states in the main switch are the union of ops and failure/success of
3021 substates associated with with that op. For example, IFMATCH is the op
3022 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3023 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3024 successfully matched A and IFMATCH_A_fail is a state saying that we have
3025 just failed to match A. Resume states always come in pairs. The backtrack
3026 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3027 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3028 on success or failure.
3030 The struct that holds a backtracking state is actually a big union, with
3031 one variant for each major type of op. The variable st points to the
3032 top-most backtrack struct. To make the code clearer, within each
3033 block of code we #define ST to alias the relevant union.
3035 Here's a concrete example of a (vastly oversimplified) IFMATCH
3041 #define ST st->u.ifmatch
3043 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3044 ST.foo = ...; // some state we wish to save
3046 // push a yes backtrack state with a resume value of
3047 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3049 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3052 case IFMATCH_A: // we have successfully executed A; now continue with B
3054 bar = ST.foo; // do something with the preserved value
3057 case IFMATCH_A_fail: // A failed, so the assertion failed
3058 ...; // do some housekeeping, then ...
3059 sayNO; // propagate the failure
3066 For any old-timers reading this who are familiar with the old recursive
3067 approach, the code above is equivalent to:
3069 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3078 ...; // do some housekeeping, then ...
3079 sayNO; // propagate the failure
3082 The topmost backtrack state, pointed to by st, is usually free. If you
3083 want to claim it, populate any ST.foo fields in it with values you wish to
3084 save, then do one of
3086 PUSH_STATE_GOTO(resume_state, node, newinput);
3087 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3089 which sets that backtrack state's resume value to 'resume_state', pushes a
3090 new free entry to the top of the backtrack stack, then goes to 'node'.
3091 On backtracking, the free slot is popped, and the saved state becomes the
3092 new free state. An ST.foo field in this new top state can be temporarily
3093 accessed to retrieve values, but once the main loop is re-entered, it
3094 becomes available for reuse.
3096 Note that the depth of the backtrack stack constantly increases during the
3097 left-to-right execution of the pattern, rather than going up and down with
3098 the pattern nesting. For example the stack is at its maximum at Z at the
3099 end of the pattern, rather than at X in the following:
3101 /(((X)+)+)+....(Y)+....Z/
3103 The only exceptions to this are lookahead/behind assertions and the cut,
3104 (?>A), which pop all the backtrack states associated with A before
3107 Backtrack state structs are allocated in slabs of about 4K in size.
3108 PL_regmatch_state and st always point to the currently active state,
3109 and PL_regmatch_slab points to the slab currently containing
3110 PL_regmatch_state. The first time regmatch() is called, the first slab is
3111 allocated, and is never freed until interpreter destruction. When the slab
3112 is full, a new one is allocated and chained to the end. At exit from
3113 regmatch(), slabs allocated since entry are freed.
3118 #define DEBUG_STATE_pp(pp) \
3120 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3121 PerlIO_printf(Perl_debug_log, \
3122 " %*s"pp" %s%s%s%s%s\n", \
3124 PL_reg_name[st->resume_state], \
3125 ((st==yes_state||st==mark_state) ? "[" : ""), \
3126 ((st==yes_state) ? "Y" : ""), \
3127 ((st==mark_state) ? "M" : ""), \
3128 ((st==yes_state||st==mark_state) ? "]" : "") \
3133 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3138 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3139 const char *start, const char *end, const char *blurb)
3141 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3143 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3148 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3149 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3151 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3152 start, end - start, 60);
3154 PerlIO_printf(Perl_debug_log,
3155 "%s%s REx%s %s against %s\n",
3156 PL_colors[4], blurb, PL_colors[5], s0, s1);
3158 if (utf8_target||utf8_pat)
3159 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3160 utf8_pat ? "pattern" : "",
3161 utf8_pat && utf8_target ? " and " : "",
3162 utf8_target ? "string" : ""
3168 S_dump_exec_pos(pTHX_ const char *locinput,
3169 const regnode *scan,
3170 const char *loc_regeol,
3171 const char *loc_bostr,
3172 const char *loc_reg_starttry,
3173 const bool utf8_target)
3175 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3176 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3177 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3178 /* The part of the string before starttry has one color
3179 (pref0_len chars), between starttry and current
3180 position another one (pref_len - pref0_len chars),
3181 after the current position the third one.
3182 We assume that pref0_len <= pref_len, otherwise we
3183 decrease pref0_len. */
3184 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3185 ? (5 + taill) - l : locinput - loc_bostr;
3188 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3190 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3192 pref0_len = pref_len - (locinput - loc_reg_starttry);
3193 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3194 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3195 ? (5 + taill) - pref_len : loc_regeol - locinput);
3196 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3200 if (pref0_len > pref_len)
3201 pref0_len = pref_len;
3203 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3205 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3206 (locinput - pref_len),pref0_len, 60, 4, 5);
3208 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3209 (locinput - pref_len + pref0_len),
3210 pref_len - pref0_len, 60, 2, 3);
3212 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3213 locinput, loc_regeol - locinput, 10, 0, 1);
3215 const STRLEN tlen=len0+len1+len2;
3216 PerlIO_printf(Perl_debug_log,
3217 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3218 (IV)(locinput - loc_bostr),
3221 (docolor ? "" : "> <"),
3223 (int)(tlen > 19 ? 0 : 19 - tlen),
3230 /* reg_check_named_buff_matched()
3231 * Checks to see if a named buffer has matched. The data array of
3232 * buffer numbers corresponding to the buffer is expected to reside
3233 * in the regexp->data->data array in the slot stored in the ARG() of
3234 * node involved. Note that this routine doesn't actually care about the
3235 * name, that information is not preserved from compilation to execution.
3236 * Returns the index of the leftmost defined buffer with the given name
3237 * or 0 if non of the buffers matched.
3240 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3243 RXi_GET_DECL(rex,rexi);
3244 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3245 I32 *nums=(I32*)SvPVX(sv_dat);
3247 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3249 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3250 if ((I32)rex->lastparen >= nums[n] &&
3251 rex->offs[nums[n]].end != -1)
3260 /* free all slabs above current one - called during LEAVE_SCOPE */
3263 S_clear_backtrack_stack(pTHX_ void *p)
3265 regmatch_slab *s = PL_regmatch_slab->next;
3270 PL_regmatch_slab->next = NULL;
3272 regmatch_slab * const osl = s;
3278 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3280 /* This function determines if there are one or two characters that match
3281 * the first character of the passed-in EXACTish node <text_node>, and if
3282 * so, returns them in the passed-in pointers.
3284 * If it determines that no possible character in the target string can
3285 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3286 * the first character in <text_node> requires UTF-8 to represent, and the
3287 * target string isn't in UTF-8.)
3289 * If there are more than two characters that could match the beginning of
3290 * <text_node>, or if more context is required to determine a match or not,
3291 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3293 * The motiviation behind this function is to allow the caller to set up
3294 * tight loops for matching. If <text_node> is of type EXACT, there is
3295 * only one possible character that can match its first character, and so
3296 * the situation is quite simple. But things get much more complicated if
3297 * folding is involved. It may be that the first character of an EXACTFish
3298 * node doesn't participate in any possible fold, e.g., punctuation, so it
3299 * can be matched only by itself. The vast majority of characters that are
3300 * in folds match just two things, their lower and upper-case equivalents.
3301 * But not all are like that; some have multiple possible matches, or match
3302 * sequences of more than one character. This function sorts all that out.
3304 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3305 * loop of trying to match A*, we know we can't exit where the thing
3306 * following it isn't a B. And something can't be a B unless it is the
3307 * beginning of B. By putting a quick test for that beginning in a tight
3308 * loop, we can rule out things that can't possibly be B without having to
3309 * break out of the loop, thus avoiding work. Similarly, if A is a single
3310 * character, we can make a tight loop matching A*, using the outputs of
3313 * If the target string to match isn't in UTF-8, and there aren't
3314 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3315 * the one or two possible octets (which are characters in this situation)
3316 * that can match. In all cases, if there is only one character that can
3317 * match, *<c1p> and *<c2p> will be identical.
3319 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3320 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3321 * can match the beginning of <text_node>. They should be declared with at
3322 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3323 * undefined what these contain.) If one or both of the buffers are
3324 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3325 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3326 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3327 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3328 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3330 const bool utf8_target = PL_reg_match_utf8;
3333 bool use_chrtest_void = FALSE;
3335 /* Used when we have both utf8 input and utf8 output, to avoid converting
3336 * to/from code points */
3337 bool utf8_has_been_setup = FALSE;
3341 U8 *pat = (U8*)STRING(text_node);
3343 if (OP(text_node) == EXACT) {
3345 /* In an exact node, only one thing can be matched, that first
3346 * character. If both the pat and the target are UTF-8, we can just
3347 * copy the input to the output, avoiding finding the code point of
3349 if (! UTF_PATTERN) {
3352 else if (utf8_target) {
3353 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3354 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3355 utf8_has_been_setup = TRUE;
3358 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3361 else /* an EXACTFish node */
3363 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3364 pat + STR_LEN(text_node)))
3366 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3367 pat + STR_LEN(text_node))))
3369 /* Multi-character folds require more context to sort out. Also
3370 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3371 * handled outside this routine */
3372 use_chrtest_void = TRUE;
3374 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3375 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3377 /* Load the folds hash, if not already done */
3379 if (! PL_utf8_foldclosures) {
3380 if (! PL_utf8_tofold) {
3381 U8 dummy[UTF8_MAXBYTES+1];
3383 /* Force loading this by folding an above-Latin1 char */
3384 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3385 assert(PL_utf8_tofold); /* Verify that worked */
3387 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3390 /* The fold closures data structure is a hash with the keys being
3391 * the UTF-8 of every character that is folded to, like 'k', and
3392 * the values each an array of all code points that fold to its
3393 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3395 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3400 /* Not found in the hash, therefore there are no folds
3401 * containing it, so there is only a single character that
3405 else { /* Does participate in folds */
3406 AV* list = (AV*) *listp;
3407 if (av_len(list) != 1) {
3409 /* If there aren't exactly two folds to this, it is outside
3410 * the scope of this function */
3411 use_chrtest_void = TRUE;
3413 else { /* There are two. Get them */
3414 SV** c_p = av_fetch(list, 0, FALSE);
3416 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3420 c_p = av_fetch(list, 1, FALSE);
3422 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3426 /* Folds that cross the 255/256 boundary are forbidden if
3427 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3428 * pattern character is above 256, and its only other match
3429 * is below 256, the only legal match will be to itself.
3430 * We have thrown away the original, so have to compute
3431 * which is the one above 255 */
3432 if ((c1 < 256) != (c2 < 256)) {
3433 if (OP(text_node) == EXACTFL
3434 || (OP(text_node) == EXACTFA
3435 && (isASCII(c1) || isASCII(c2))))
3448 else /* Here, c1 is < 255 */
3450 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3451 && OP(text_node) != EXACTFL
3452 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3454 /* Here, there could be something above Latin1 in the target which
3455 * folds to this character in the pattern. All such cases except
3456 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3457 * involved in their folds, so are outside the scope of this
3459 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3460 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3463 use_chrtest_void = TRUE;
3466 else { /* Here nothing above Latin1 can fold to the pattern character */
3467 switch (OP(text_node)) {
3469 case EXACTFL: /* /l rules */
3470 c2 = PL_fold_locale[c1];
3474 if (! utf8_target) { /* /d rules */
3479 /* /u rules for all these. This happens to work for
3480 * EXACTFA as nothing in Latin1 folds to ASCII */
3482 case EXACTFU_TRICKYFOLD:
3485 c2 = PL_fold_latin1[c1];
3488 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3493 /* Here have figured things out. Set up the returns */
3494 if (use_chrtest_void) {
3495 *c2p = *c1p = CHRTEST_VOID;
3497 else if (utf8_target) {
3498 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3499 uvchr_to_utf8(c1_utf8, c1);
3500 uvchr_to_utf8(c2_utf8, c2);
3503 /* Invariants are stored in both the utf8 and byte outputs; Use
3504 * negative numbers otherwise for the byte ones. Make sure that the
3505 * byte ones are the same iff the utf8 ones are the same */
3506 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3507 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3510 ? CHRTEST_NOT_A_CP_1
3511 : CHRTEST_NOT_A_CP_2;
3513 else if (c1 > 255) {
3514 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3519 *c1p = *c2p = c2; /* c2 is the only representable value */
3521 else { /* c1 is representable; see about c2 */
3523 *c2p = (c2 < 256) ? c2 : c1;
3529 /* returns -1 on failure, $+[0] on success */
3531 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3533 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3537 const bool utf8_target = PL_reg_match_utf8;
3538 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3539 REGEXP *rex_sv = reginfo->prog;
3540 regexp *rex = (struct regexp *)SvANY(rex_sv);
3541 RXi_GET_DECL(rex,rexi);
3543 /* the current state. This is a cached copy of PL_regmatch_state */
3545 /* cache heavy used fields of st in registers */
3548 U32 n = 0; /* general value; init to avoid compiler warning */
3549 I32 ln = 0; /* len or last; init to avoid compiler warning */
3550 char *locinput = startpos;
3551 char *pushinput; /* where to continue after a PUSH */
3552 I32 nextchr; /* is always set to UCHARAT(locinput) */
3554 bool result = 0; /* return value of S_regmatch */
3555 int depth = 0; /* depth of backtrack stack */
3556 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3557 const U32 max_nochange_depth =
3558 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3559 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3560 regmatch_state *yes_state = NULL; /* state to pop to on success of
3562 /* mark_state piggy backs on the yes_state logic so that when we unwind
3563 the stack on success we can update the mark_state as we go */
3564 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3565 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3566 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3568 bool no_final = 0; /* prevent failure from backtracking? */
3569 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3570 char *startpoint = locinput;
3571 SV *popmark = NULL; /* are we looking for a mark? */
3572 SV *sv_commit = NULL; /* last mark name seen in failure */
3573 SV *sv_yes_mark = NULL; /* last mark name we have seen
3574 during a successful match */
3575 U32 lastopen = 0; /* last open we saw */
3576 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3577 SV* const oreplsv = GvSV(PL_replgv);
3578 /* these three flags are set by various ops to signal information to
3579 * the very next op. They have a useful lifetime of exactly one loop
3580 * iteration, and are not preserved or restored by state pushes/pops
3582 bool sw = 0; /* the condition value in (?(cond)a|b) */
3583 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3584 int logical = 0; /* the following EVAL is:
3588 or the following IFMATCH/UNLESSM is:
3589 false: plain (?=foo)
3590 true: used as a condition: (?(?=foo))
3592 PAD* last_pad = NULL;
3594 I32 gimme = G_SCALAR;
3595 CV *caller_cv = NULL; /* who called us */
3596 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3597 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3600 GET_RE_DEBUG_FLAGS_DECL;
3603 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3604 multicall_oldcatch = 0;
3605 multicall_cv = NULL;
3607 PERL_UNUSED_VAR(multicall_cop);
3608 PERL_UNUSED_VAR(newsp);
3611 PERL_ARGS_ASSERT_REGMATCH;
3613 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3614 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3616 /* on first ever call to regmatch, allocate first slab */
3617 if (!PL_regmatch_slab) {
3618 Newx(PL_regmatch_slab, 1, regmatch_slab);
3619 PL_regmatch_slab->prev = NULL;
3620 PL_regmatch_slab->next = NULL;
3621 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3624 oldsave = PL_savestack_ix;
3625 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3626 SAVEVPTR(PL_regmatch_slab);
3627 SAVEVPTR(PL_regmatch_state);
3629 /* grab next free state slot */
3630 st = ++PL_regmatch_state;
3631 if (st > SLAB_LAST(PL_regmatch_slab))
3632 st = PL_regmatch_state = S_push_slab(aTHX);
3634 /* Note that nextchr is a byte even in UTF */
3637 while (scan != NULL) {
3640 SV * const prop = sv_newmortal();
3641 regnode *rnext=regnext(scan);
3642 DUMP_EXEC_POS( locinput, scan, utf8_target );
3643 regprop(rex, prop, scan);
3645 PerlIO_printf(Perl_debug_log,
3646 "%3"IVdf":%*s%s(%"IVdf")\n",
3647 (IV)(scan - rexi->program), depth*2, "",
3649 (PL_regkind[OP(scan)] == END || !rnext) ?
3650 0 : (IV)(rnext - rexi->program));
3653 next = scan + NEXT_OFF(scan);
3656 state_num = OP(scan);
3661 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3663 switch (state_num) {
3664 case BOL: /* /^../ */
3665 if (locinput == PL_bostr)
3667 /* reginfo->till = reginfo->bol; */
3672 case MBOL: /* /^../m */
3673 if (locinput == PL_bostr ||
3674 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3680 case SBOL: /* /^../s */
3681 if (locinput == PL_bostr)
3686 if (locinput == reginfo->ganch)
3690 case KEEPS: /* \K */
3691 /* update the startpoint */
3692 st->u.keeper.val = rex->offs[0].start;
3693 rex->offs[0].start = locinput - PL_bostr;
3694 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3696 case KEEPS_next_fail:
3697 /* rollback the start point change */
3698 rex->offs[0].start = st->u.keeper.val;
3702 case EOL: /* /..$/ */
3705 case MEOL: /* /..$/m */
3706 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3710 case SEOL: /* /..$/s */
3712 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3714 if (PL_regeol - locinput > 1)
3719 if (!NEXTCHR_IS_EOS)
3723 case SANY: /* /./s */
3726 goto increment_locinput;
3734 case REG_ANY: /* /./ */
3735 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3737 goto increment_locinput;
3741 #define ST st->u.trie
3742 case TRIEC: /* (ab|cd) with known charclass */
3743 /* In this case the charclass data is available inline so
3744 we can fail fast without a lot of extra overhead.
3746 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3748 PerlIO_printf(Perl_debug_log,
3749 "%*s %sfailed to match trie start class...%s\n",
3750 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3753 assert(0); /* NOTREACHED */
3756 case TRIE: /* (ab|cd) */
3757 /* the basic plan of execution of the trie is:
3758 * At the beginning, run though all the states, and
3759 * find the longest-matching word. Also remember the position
3760 * of the shortest matching word. For example, this pattern:
3763 * when matched against the string "abcde", will generate
3764 * accept states for all words except 3, with the longest
3765 * matching word being 4, and the shortest being 2 (with
3766 * the position being after char 1 of the string).
3768 * Then for each matching word, in word order (i.e. 1,2,4,5),
3769 * we run the remainder of the pattern; on each try setting
3770 * the current position to the character following the word,
3771 * returning to try the next word on failure.
3773 * We avoid having to build a list of words at runtime by
3774 * using a compile-time structure, wordinfo[].prev, which
3775 * gives, for each word, the previous accepting word (if any).
3776 * In the case above it would contain the mappings 1->2, 2->0,
3777 * 3->0, 4->5, 5->1. We can use this table to generate, from
3778 * the longest word (4 above), a list of all words, by
3779 * following the list of prev pointers; this gives us the
3780 * unordered list 4,5,1,2. Then given the current word we have
3781 * just tried, we can go through the list and find the
3782 * next-biggest word to try (so if we just failed on word 2,
3783 * the next in the list is 4).
3785 * Since at runtime we don't record the matching position in
3786 * the string for each word, we have to work that out for
3787 * each word we're about to process. The wordinfo table holds
3788 * the character length of each word; given that we recorded
3789 * at the start: the position of the shortest word and its
3790 * length in chars, we just need to move the pointer the
3791 * difference between the two char lengths. Depending on
3792 * Unicode status and folding, that's cheap or expensive.
3794 * This algorithm is optimised for the case where are only a
3795 * small number of accept states, i.e. 0,1, or maybe 2.
3796 * With lots of accepts states, and having to try all of them,
3797 * it becomes quadratic on number of accept states to find all
3802 /* what type of TRIE am I? (utf8 makes this contextual) */
3803 DECL_TRIE_TYPE(scan);
3805 /* what trie are we using right now */
3806 reg_trie_data * const trie
3807 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3808 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3809 U32 state = trie->startstate;
3812 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3814 if (trie->states[ state ].wordnum) {
3816 PerlIO_printf(Perl_debug_log,
3817 "%*s %smatched empty string...%s\n",
3818 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3824 PerlIO_printf(Perl_debug_log,
3825 "%*s %sfailed to match trie start class...%s\n",
3826 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3833 U8 *uc = ( U8* )locinput;
3837 U8 *uscan = (U8*)NULL;
3838 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3839 U32 charcount = 0; /* how many input chars we have matched */
3840 U32 accepted = 0; /* have we seen any accepting states? */
3842 ST.jump = trie->jump;
3845 ST.longfold = FALSE; /* char longer if folded => it's harder */
3848 /* fully traverse the TRIE; note the position of the
3849 shortest accept state and the wordnum of the longest
3852 while ( state && uc <= (U8*)PL_regeol ) {
3853 U32 base = trie->states[ state ].trans.base;
3857 wordnum = trie->states[ state ].wordnum;
3859 if (wordnum) { /* it's an accept state */
3862 /* record first match position */
3864 ST.firstpos = (U8*)locinput;
3869 ST.firstchars = charcount;
3872 if (!ST.nextword || wordnum < ST.nextword)
3873 ST.nextword = wordnum;
3874 ST.topword = wordnum;
3877 DEBUG_TRIE_EXECUTE_r({
3878 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3879 PerlIO_printf( Perl_debug_log,
3880 "%*s %sState: %4"UVxf" Accepted: %c ",
3881 2+depth * 2, "", PL_colors[4],
3882 (UV)state, (accepted ? 'Y' : 'N'));
3885 /* read a char and goto next state */
3886 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3888 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3889 uscan, len, uvc, charid, foldlen,
3896 base + charid - 1 - trie->uniquecharcount)) >= 0)
3898 && ((U32)offset < trie->lasttrans)
3899 && trie->trans[offset].check == state)
3901 state = trie->trans[offset].next;
3912 DEBUG_TRIE_EXECUTE_r(
3913 PerlIO_printf( Perl_debug_log,
3914 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3915 charid, uvc, (UV)state, PL_colors[5] );
3921 /* calculate total number of accept states */
3926 w = trie->wordinfo[w].prev;
3929 ST.accepted = accepted;
3933 PerlIO_printf( Perl_debug_log,
3934 "%*s %sgot %"IVdf" possible matches%s\n",
3935 REPORT_CODE_OFF + depth * 2, "",
3936 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3938 goto trie_first_try; /* jump into the fail handler */
3940 assert(0); /* NOTREACHED */
3942 case TRIE_next_fail: /* we failed - try next alternative */
3946 REGCP_UNWIND(ST.cp);
3947 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3949 if (!--ST.accepted) {
3951 PerlIO_printf( Perl_debug_log,
3952 "%*s %sTRIE failed...%s\n",
3953 REPORT_CODE_OFF+depth*2, "",
3960 /* Find next-highest word to process. Note that this code
3961 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3964 U16 const nextword = ST.nextword;
3965 reg_trie_wordinfo * const wordinfo
3966 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3967 for (word=ST.topword; word; word=wordinfo[word].prev) {
3968 if (word > nextword && (!min || word < min))
3981 ST.lastparen = rex->lastparen;
3982 ST.lastcloseparen = rex->lastcloseparen;
3986 /* find start char of end of current word */
3988 U32 chars; /* how many chars to skip */
3989 reg_trie_data * const trie
3990 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3992 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3994 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3999 /* the hard option - fold each char in turn and find
4000 * its folded length (which may be different */
4001 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4009 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4017 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4022 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4038 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4039 ? ST.jump[ST.nextword]
4043 PerlIO_printf( Perl_debug_log,
4044 "%*s %sTRIE matched word #%d, continuing%s\n",
4045 REPORT_CODE_OFF+depth*2, "",
4052 if (ST.accepted > 1 || has_cutgroup) {
4053 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4054 assert(0); /* NOTREACHED */
4056 /* only one choice left - just continue */
4058 AV *const trie_words
4059 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4060 SV ** const tmp = av_fetch( trie_words,
4062 SV *sv= tmp ? sv_newmortal() : NULL;
4064 PerlIO_printf( Perl_debug_log,
4065 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4066 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4068 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4069 PL_colors[0], PL_colors[1],
4070 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4072 : "not compiled under -Dr",
4076 locinput = (char*)uc;
4077 continue; /* execute rest of RE */
4078 assert(0); /* NOTREACHED */
4082 case EXACT: { /* /abc/ */
4083 char *s = STRING(scan);
4085 if (utf8_target != UTF_PATTERN) {
4086 /* The target and the pattern have differing utf8ness. */
4088 const char * const e = s + ln;
4091 /* The target is utf8, the pattern is not utf8.
4092 * Above-Latin1 code points can't match the pattern;
4093 * invariants match exactly, and the other Latin1 ones need
4094 * to be downgraded to a single byte in order to do the
4095 * comparison. (If we could be confident that the target
4096 * is not malformed, this could be refactored to have fewer
4097 * tests by just assuming that if the first bytes match, it
4098 * is an invariant, but there are tests in the test suite
4099 * dealing with (??{...}) which violate this) */
4103 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4106 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4113 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4122 /* The target is not utf8, the pattern is utf8. */
4124 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4128 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4135 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4146 /* The target and the pattern have the same utf8ness. */
4147 /* Inline the first character, for speed. */
4148 if (UCHARAT(s) != nextchr)
4150 if (PL_regeol - locinput < ln)
4152 if (ln > 1 && memNE(s, locinput, ln))
4158 case EXACTFL: { /* /abc/il */
4160 const U8 * fold_array;
4162 U32 fold_utf8_flags;
4164 PL_reg_flags |= RF_tainted;
4165 folder = foldEQ_locale;
4166 fold_array = PL_fold_locale;
4167 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4170 case EXACTFU_SS: /* /\x{df}/iu */
4171 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4172 case EXACTFU: /* /abc/iu */
4173 folder = foldEQ_latin1;
4174 fold_array = PL_fold_latin1;
4175 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4178 case EXACTFA: /* /abc/iaa */
4179 folder = foldEQ_latin1;
4180 fold_array = PL_fold_latin1;
4181 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4184 case EXACTF: /* /abc/i */
4186 fold_array = PL_fold;
4187 fold_utf8_flags = 0;
4193 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4194 /* Either target or the pattern are utf8, or has the issue where
4195 * the fold lengths may differ. */
4196 const char * const l = locinput;
4197 char *e = PL_regeol;
4199 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
4200 l, &e, 0, utf8_target, fold_utf8_flags))
4208 /* Neither the target nor the pattern are utf8 */
4209 if (UCHARAT(s) != nextchr
4211 && UCHARAT(s) != fold_array[nextchr])
4215 if (PL_regeol - locinput < ln)
4217 if (ln > 1 && ! folder(s, locinput, ln))
4223 /* XXX Could improve efficiency by separating these all out using a
4224 * macro or in-line function. At that point regcomp.c would no longer
4225 * have to set the FLAGS fields of these */
4226 case BOUNDL: /* /\b/l */
4227 case NBOUNDL: /* /\B/l */
4228 PL_reg_flags |= RF_tainted;
4230 case BOUND: /* /\b/ */
4231 case BOUNDU: /* /\b/u */
4232 case BOUNDA: /* /\b/a */
4233 case NBOUND: /* /\B/ */
4234 case NBOUNDU: /* /\B/u */
4235 case NBOUNDA: /* /\B/a */
4236 /* was last char in word? */
4238 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4239 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4241 if (locinput == PL_bostr)
4244 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4246 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4248 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4249 ln = isALNUM_uni(ln);
4253 LOAD_UTF8_CHARCLASS_ALNUM();
4254 n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4259 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4260 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4265 /* Here the string isn't utf8, or is utf8 and only ascii
4266 * characters are to match \w. In the latter case looking at
4267 * the byte just prior to the current one may be just the final
4268 * byte of a multi-byte character. This is ok. There are two
4270 * 1) it is a single byte character, and then the test is doing
4271 * just what it's supposed to.
4272 * 2) it is a multi-byte character, in which case the final
4273 * byte is never mistakable for ASCII, and so the test
4274 * will say it is not a word character, which is the
4275 * correct answer. */
4276 ln = (locinput != PL_bostr) ?
4277 UCHARAT(locinput - 1) : '\n';
4278 switch (FLAGS(scan)) {
4279 case REGEX_UNICODE_CHARSET:
4280 ln = isWORDCHAR_L1(ln);
4281 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4283 case REGEX_LOCALE_CHARSET:
4284 ln = isALNUM_LC(ln);
4285 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4287 case REGEX_DEPENDS_CHARSET:
4289 n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4291 case REGEX_ASCII_RESTRICTED_CHARSET:
4292 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4293 ln = isWORDCHAR_A(ln);
4294 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4297 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4301 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4303 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4307 case ANYOF: /* /[abc]/ */
4311 STRLEN inclasslen = PL_regeol - locinput;
4312 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
4314 locinput += inclasslen;
4318 if (!REGINCLASS(rex, scan, (U8*)locinput))
4325 /* Special char classes: \d, \w etc.
4326 * The defines start on line 166 or so */
4327 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
4328 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4329 ALNUMU, NALNUMU, isWORDCHAR_L1,
4330 ALNUMA, NALNUMA, isWORDCHAR_A,
4333 CCC_TRY_U(SPACE, NSPACE, isSPACE,
4334 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4335 SPACEU, NSPACEU, isSPACE_L1,
4336 SPACEA, NSPACEA, isSPACE_A,
4339 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4340 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4341 DIGITA, NDIGITA, isDIGIT_A,
4344 case POSIXA: /* /[[:ascii:]]/ etc */
4345 if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4348 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4352 case NPOSIXA: /* /[^[:ascii:]]/ etc */
4353 if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4356 goto increment_locinput;
4358 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4359 a Unicode extended Grapheme Cluster */
4360 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4361 extended Grapheme Cluster is:
4364 | Prepend* Begin Extend*
4367 Begin is: ( Special_Begin | ! Control )
4368 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4369 Extend is: ( Grapheme_Extend | Spacing_Mark )
4370 Control is: [ GCB_Control CR LF ]
4371 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4373 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4376 Begin is ( Regular_Begin + Special Begin )
4378 It turns out that 98.4% of all Unicode code points match
4379 Regular_Begin. Doing it this way eliminates a table match in
4380 the previous implementation for almost all Unicode code points.
4382 There is a subtlety with Prepend* which showed up in testing.
4383 Note that the Begin, and only the Begin is required in:
4384 | Prepend* Begin Extend*
4385 Also, Begin contains '! Control'. A Prepend must be a
4386 '! Control', which means it must also be a Begin. What it
4387 comes down to is that if we match Prepend* and then find no
4388 suitable Begin afterwards, that if we backtrack the last
4389 Prepend, that one will be a suitable Begin.
4394 if (! utf8_target) {
4396 /* Match either CR LF or '.', as all the other possibilities
4398 locinput++; /* Match the . or CR */
4399 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4401 && locinput < PL_regeol
4402 && UCHARAT(locinput) == '\n') locinput++;
4406 /* Utf8: See if is ( CR LF ); already know that locinput <
4407 * PL_regeol, so locinput+1 is in bounds */
4408 if ( nextchr == '\r' && locinput+1 < PL_regeol
4409 && UCHARAT(locinput + 1) == '\n')
4416 /* In case have to backtrack to beginning, then match '.' */
4417 char *starting = locinput;
4419 /* In case have to backtrack the last prepend */
4420 char *previous_prepend = 0;
4422 LOAD_UTF8_CHARCLASS_GCB();
4424 /* Match (prepend)* */
4425 while (locinput < PL_regeol
4426 && (len = is_GCB_Prepend_utf8(locinput)))
4428 previous_prepend = locinput;
4432 /* As noted above, if we matched a prepend character, but
4433 * the next thing won't match, back off the last prepend we
4434 * matched, as it is guaranteed to match the begin */
4435 if (previous_prepend
4436 && (locinput >= PL_regeol
4437 || (! swash_fetch(PL_utf8_X_regular_begin,
4438 (U8*)locinput, utf8_target)
4439 && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4442 locinput = previous_prepend;
4445 /* Note that here we know PL_regeol > locinput, as we
4446 * tested that upon input to this switch case, and if we
4447 * moved locinput forward, we tested the result just above
4448 * and it either passed, or we backed off so that it will
4450 if (swash_fetch(PL_utf8_X_regular_begin,
4451 (U8*)locinput, utf8_target)) {
4452 locinput += UTF8SKIP(locinput);
4454 else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4456 /* Here did not match the required 'Begin' in the
4457 * second term. So just match the very first
4458 * character, the '.' of the final term of the regex */
4459 locinput = starting + UTF8SKIP(starting);
4463 /* Here is a special begin. It can be composed of
4464 * several individual characters. One possibility is
4466 if ((len = is_GCB_RI_utf8(locinput))) {
4468 while (locinput < PL_regeol
4469 && (len = is_GCB_RI_utf8(locinput)))
4473 } else if ((len = is_GCB_T_utf8(locinput))) {
4474 /* Another possibility is T+ */
4476 while (locinput < PL_regeol
4477 && (len = is_GCB_T_utf8(locinput)))
4483 /* Here, neither RI+ nor T+; must be some other
4484 * Hangul. That means it is one of the others: L,
4485 * LV, LVT or V, and matches:
4486 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4489 while (locinput < PL_regeol
4490 && (len = is_GCB_L_utf8(locinput)))
4495 /* Here, have exhausted L*. If the next character
4496 * is not an LV, LVT nor V, it means we had to have
4497 * at least one L, so matches L+ in the original
4498 * equation, we have a complete hangul syllable.
4501 if (locinput < PL_regeol
4502 && is_GCB_LV_LVT_V_utf8(locinput))
4505 /* Otherwise keep going. Must be LV, LVT or V.
4507 if (is_utf8_X_LVT((U8*)locinput)) {
4508 locinput += UTF8SKIP(locinput);
4511 /* Must be V or LV. Take it, then match
4513 locinput += UTF8SKIP(locinput);
4514 while (locinput < PL_regeol
4515 && (len = is_GCB_V_utf8(locinput)))
4521 /* And any of LV, LVT, or V can be followed
4523 while (locinput < PL_regeol
4524 && (len = is_GCB_T_utf8(locinput)))
4532 /* Match any extender */
4533 while (locinput < PL_regeol
4534 && swash_fetch(PL_utf8_X_extend,
4535 (U8*)locinput, utf8_target))
4537 locinput += UTF8SKIP(locinput);
4541 if (locinput > PL_regeol) sayNO;
4545 case NREFFL: /* /\g{name}/il */
4546 { /* The capture buffer cases. The ones beginning with N for the
4547 named buffers just convert to the equivalent numbered and
4548 pretend they were called as the corresponding numbered buffer
4550 /* don't initialize these in the declaration, it makes C++
4555 const U8 *fold_array;
4558 PL_reg_flags |= RF_tainted;
4559 folder = foldEQ_locale;
4560 fold_array = PL_fold_locale;
4562 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4565 case NREFFA: /* /\g{name}/iaa */
4566 folder = foldEQ_latin1;
4567 fold_array = PL_fold_latin1;
4569 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4572 case NREFFU: /* /\g{name}/iu */
4573 folder = foldEQ_latin1;
4574 fold_array = PL_fold_latin1;
4576 utf8_fold_flags = 0;
4579 case NREFF: /* /\g{name}/i */
4581 fold_array = PL_fold;
4583 utf8_fold_flags = 0;
4586 case NREF: /* /\g{name}/ */
4590 utf8_fold_flags = 0;
4593 /* For the named back references, find the corresponding buffer
4595 n = reg_check_named_buff_matched(rex,scan);
4600 goto do_nref_ref_common;
4602 case REFFL: /* /\1/il */
4603 PL_reg_flags |= RF_tainted;
4604 folder = foldEQ_locale;
4605 fold_array = PL_fold_locale;
4606 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4609 case REFFA: /* /\1/iaa */
4610 folder = foldEQ_latin1;
4611 fold_array = PL_fold_latin1;
4612 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4615 case REFFU: /* /\1/iu */
4616 folder = foldEQ_latin1;
4617 fold_array = PL_fold_latin1;
4618 utf8_fold_flags = 0;
4621 case REFF: /* /\1/i */
4623 fold_array = PL_fold;
4624 utf8_fold_flags = 0;
4627 case REF: /* /\1/ */
4630 utf8_fold_flags = 0;
4634 n = ARG(scan); /* which paren pair */
4637 ln = rex->offs[n].start;
4638 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4639 if (rex->lastparen < n || ln == -1)
4640 sayNO; /* Do not match unless seen CLOSEn. */
4641 if (ln == rex->offs[n].end)
4645 if (type != REF /* REF can do byte comparison */
4646 && (utf8_target || type == REFFU))
4647 { /* XXX handle REFFL better */
4648 char * limit = PL_regeol;
4650 /* This call case insensitively compares the entire buffer
4651 * at s, with the current input starting at locinput, but
4652 * not going off the end given by PL_regeol, and returns in
4653 * <limit> upon success, how much of the current input was
4655 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4656 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4664 /* Not utf8: Inline the first character, for speed. */
4665 if (!NEXTCHR_IS_EOS &&
4666 UCHARAT(s) != nextchr &&
4668 UCHARAT(s) != fold_array[nextchr]))
4670 ln = rex->offs[n].end - ln;
4671 if (locinput + ln > PL_regeol)
4673 if (ln > 1 && (type == REF
4674 ? memNE(s, locinput, ln)
4675 : ! folder(s, locinput, ln)))
4681 case NOTHING: /* null op; e.g. the 'nothing' following
4682 * the '*' in m{(a+|b)*}' */
4684 case TAIL: /* placeholder while compiling (A|B|C) */
4687 case BACK: /* ??? doesn't appear to be used ??? */
4691 #define ST st->u.eval
4696 regexp_internal *rei;
4697 regnode *startpoint;
4699 case GOSTART: /* (?R) */
4700 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4701 if (cur_eval && cur_eval->locinput==locinput) {
4702 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4703 Perl_croak(aTHX_ "Infinite recursion in regex");
4704 if ( ++nochange_depth > max_nochange_depth )
4706 "Pattern subroutine nesting without pos change"
4707 " exceeded limit in regex");
4714 if (OP(scan)==GOSUB) {
4715 startpoint = scan + ARG2L(scan);
4716 ST.close_paren = ARG(scan);
4718 startpoint = rei->program+1;
4721 goto eval_recurse_doit;
4722 assert(0); /* NOTREACHED */
4724 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4725 if (cur_eval && cur_eval->locinput==locinput) {
4726 if ( ++nochange_depth > max_nochange_depth )
4727 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4732 /* execute the code in the {...} */
4736 OP * const oop = PL_op;
4737 COP * const ocurcop = PL_curcop;
4739 char *saved_regeol = PL_regeol;
4740 struct re_save_state saved_state;
4743 /* save *all* paren positions */
4745 REGCP_SET(runops_cp);
4747 /* To not corrupt the existing regex state while executing the
4748 * eval we would normally put it on the save stack, like with
4749 * save_re_context. However, re-evals have a weird scoping so we
4750 * can't just add ENTER/LEAVE here. With that, things like
4752 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4754 * would break, as they expect the localisation to be unwound
4755 * only when the re-engine backtracks through the bit that
4758 * What we do instead is just saving the state in a local c
4761 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4763 PL_reg_state.re_reparsing = FALSE;
4766 caller_cv = find_runcv(NULL);
4770 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4771 newcv = ((struct regexp *)SvANY(
4772 (REGEXP*)(rexi->data->data[n])
4775 nop = (OP*)rexi->data->data[n+1];
4777 else if (rexi->data->what[n] == 'l') { /* literal code */
4779 nop = (OP*)rexi->data->data[n];
4780 assert(CvDEPTH(newcv));
4783 /* literal with own CV */
4784 assert(rexi->data->what[n] == 'L');
4785 newcv = rex->qr_anoncv;
4786 nop = (OP*)rexi->data->data[n];
4789 /* normally if we're about to execute code from the same
4790 * CV that we used previously, we just use the existing
4791 * CX stack entry. However, its possible that in the
4792 * meantime we may have backtracked, popped from the save
4793 * stack, and undone the SAVECOMPPAD(s) associated with
4794 * PUSH_MULTICALL; in which case PL_comppad no longer
4795 * points to newcv's pad. */
4796 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4798 I32 depth = (newcv == caller_cv) ? 0 : 1;
4799 if (last_pushed_cv) {
4800 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4803 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4805 last_pushed_cv = newcv;
4807 last_pad = PL_comppad;
4809 /* the initial nextstate you would normally execute
4810 * at the start of an eval (which would cause error
4811 * messages to come from the eval), may be optimised
4812 * away from the execution path in the regex code blocks;
4813 * so manually set PL_curcop to it initially */
4815 OP *o = cUNOPx(nop)->op_first;
4816 assert(o->op_type == OP_NULL);
4817 if (o->op_targ == OP_SCOPE) {
4818 o = cUNOPo->op_first;
4821 assert(o->op_targ == OP_LEAVE);
4822 o = cUNOPo->op_first;
4823 assert(o->op_type == OP_ENTER);
4827 if (o->op_type != OP_STUB) {
4828 assert( o->op_type == OP_NEXTSTATE
4829 || o->op_type == OP_DBSTATE
4830 || (o->op_type == OP_NULL
4831 && ( o->op_targ == OP_NEXTSTATE
4832 || o->op_targ == OP_DBSTATE
4836 PL_curcop = (COP*)o;
4841 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4842 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4844 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4847 SV *sv_mrk = get_sv("REGMARK", 1);
4848 sv_setsv(sv_mrk, sv_yes_mark);
4851 /* we don't use MULTICALL here as we want to call the
4852 * first op of the block of interest, rather than the
4853 * first op of the sub */
4856 CALLRUNOPS(aTHX); /* Scalar context. */
4859 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4865 /* before restoring everything, evaluate the returned
4866 * value, so that 'uninit' warnings don't use the wrong
4867 * PL_op or pad. Also need to process any magic vars
4868 * (e.g. $1) *before* parentheses are restored */
4873 if (logical == 0) /* (?{})/ */
4874 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4875 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4876 sw = cBOOL(SvTRUE(ret));
4879 else { /* /(??{}) */
4880 /* if its overloaded, let the regex compiler handle
4881 * it; otherwise extract regex, or stringify */
4882 if (!SvAMAGIC(ret)) {
4886 if (SvTYPE(sv) == SVt_REGEXP)
4887 re_sv = (REGEXP*) sv;
4888 else if (SvSMAGICAL(sv)) {
4889 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4891 re_sv = (REGEXP *) mg->mg_obj;
4894 /* force any magic, undef warnings here */
4896 ret = sv_mortalcopy(ret);
4897 (void) SvPV_force_nolen(ret);
4903 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4905 /* *** Note that at this point we don't restore
4906 * PL_comppad, (or pop the CxSUB) on the assumption it may
4907 * be used again soon. This is safe as long as nothing
4908 * in the regexp code uses the pad ! */
4910 PL_curcop = ocurcop;
4911 PL_regeol = saved_regeol;
4912 S_regcp_restore(aTHX_ rex, runops_cp);
4918 /* only /(??{})/ from now on */
4921 /* extract RE object from returned value; compiling if
4925 re_sv = reg_temp_copy(NULL, re_sv);
4929 const I32 osize = PL_regsize;
4931 if (SvUTF8(ret) && IN_BYTES) {
4932 /* In use 'bytes': make a copy of the octet
4933 * sequence, but without the flag on */
4935 const char *const p = SvPV(ret, len);
4936 ret = newSVpvn_flags(p, len, SVs_TEMP);
4938 if (rex->intflags & PREGf_USE_RE_EVAL)
4939 pm_flags |= PMf_USE_RE_EVAL;
4941 /* if we got here, it should be an engine which
4942 * supports compiling code blocks and stuff */
4943 assert(rex->engine && rex->engine->op_comp);
4944 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4945 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4946 rex->engine, NULL, NULL,
4947 /* copy /msix etc to inner pattern */
4952 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4954 /* This isn't a first class regexp. Instead, it's
4955 caching a regexp onto an existing, Perl visible
4957 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4960 /* safe to do now that any $1 etc has been
4961 * interpolated into the new pattern string and
4963 S_regcp_restore(aTHX_ rex, runops_cp);
4965 re = (struct regexp *)SvANY(re_sv);
4967 RXp_MATCH_COPIED_off(re);
4968 re->subbeg = rex->subbeg;
4969 re->sublen = rex->sublen;
4970 re->suboffset = rex->suboffset;
4971 re->subcoffset = rex->subcoffset;
4974 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4975 "Matching embedded");
4977 startpoint = rei->program + 1;
4978 ST.close_paren = 0; /* only used for GOSUB */
4980 eval_recurse_doit: /* Share code with GOSUB below this line */
4981 /* run the pattern returned from (??{...}) */
4982 ST.cp = regcppush(rex, 0); /* Save *all* the positions. */
4983 REGCP_SET(ST.lastcp);
4986 re->lastcloseparen = 0;
4990 /* XXXX This is too dramatic a measure... */
4993 ST.toggle_reg_flags = PL_reg_flags;
4995 PL_reg_flags |= RF_utf8;
4997 PL_reg_flags &= ~RF_utf8;
4998 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5000 ST.prev_rex = rex_sv;
5001 ST.prev_curlyx = cur_curlyx;
5003 SET_reg_curpm(rex_sv);
5008 ST.prev_eval = cur_eval;
5010 /* now continue from first node in postoned RE */
5011 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5012 assert(0); /* NOTREACHED */
5015 case EVAL_AB: /* cleanup after a successful (??{A})B */
5016 /* note: this is called twice; first after popping B, then A */
5017 PL_reg_flags ^= ST.toggle_reg_flags;
5018 rex_sv = ST.prev_rex;
5019 SET_reg_curpm(rex_sv);
5020 rex = (struct regexp *)SvANY(rex_sv);
5021 rexi = RXi_GET(rex);
5023 cur_eval = ST.prev_eval;
5024 cur_curlyx = ST.prev_curlyx;
5026 /* XXXX This is too dramatic a measure... */
5028 if ( nochange_depth )
5033 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5034 /* note: this is called twice; first after popping B, then A */
5035 PL_reg_flags ^= ST.toggle_reg_flags;
5036 rex_sv = ST.prev_rex;
5037 SET_reg_curpm(rex_sv);
5038 rex = (struct regexp *)SvANY(rex_sv);
5039 rexi = RXi_GET(rex);
5041 REGCP_UNWIND(ST.lastcp);
5043 cur_eval = ST.prev_eval;
5044 cur_curlyx = ST.prev_curlyx;
5045 /* XXXX This is too dramatic a measure... */
5047 if ( nochange_depth )
5053 n = ARG(scan); /* which paren pair */
5054 rex->offs[n].start_tmp = locinput - PL_bostr;
5057 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5058 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5062 (IV)rex->offs[n].start_tmp,
5068 /* XXX really need to log other places start/end are set too */
5069 #define CLOSE_CAPTURE \
5070 rex->offs[n].start = rex->offs[n].start_tmp; \
5071 rex->offs[n].end = locinput - PL_bostr; \
5072 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5073 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5075 PTR2UV(rex->offs), \
5077 (IV)rex->offs[n].start, \
5078 (IV)rex->offs[n].end \
5082 n = ARG(scan); /* which paren pair */
5084 /*if (n > PL_regsize)
5086 if (n > rex->lastparen)
5088 rex->lastcloseparen = n;
5089 if (cur_eval && cur_eval->u.eval.close_paren == n) {
5094 case ACCEPT: /* (*ACCEPT) */
5098 cursor && OP(cursor)!=END;
5099 cursor=regnext(cursor))
5101 if ( OP(cursor)==CLOSE ){
5103 if ( n <= lastopen ) {
5105 /*if (n > PL_regsize)
5107 if (n > rex->lastparen)
5109 rex->lastcloseparen = n;
5110 if ( n == ARG(scan) || (cur_eval &&
5111 cur_eval->u.eval.close_paren == n))
5120 case GROUPP: /* (?(1)) */
5121 n = ARG(scan); /* which paren pair */
5122 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5125 case NGROUPP: /* (?(<name>)) */
5126 /* reg_check_named_buff_matched returns 0 for no match */
5127 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5130 case INSUBP: /* (?(R)) */
5132 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5135 case DEFINEP: /* (?(DEFINE)) */
5139 case IFTHEN: /* (?(cond)A|B) */
5140 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5142 next = NEXTOPER(NEXTOPER(scan));
5144 next = scan + ARG(scan);
5145 if (OP(next) == IFTHEN) /* Fake one. */
5146 next = NEXTOPER(NEXTOPER(next));
5150 case LOGICAL: /* modifier for EVAL and IFMATCH */
5151 logical = scan->flags;
5154 /*******************************************************************
5156 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5157 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5158 STAR/PLUS/CURLY/CURLYN are used instead.)
5160 A*B is compiled as <CURLYX><A><WHILEM><B>
5162 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5163 state, which contains the current count, initialised to -1. It also sets
5164 cur_curlyx to point to this state, with any previous value saved in the
5167 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5168 since the pattern may possibly match zero times (i.e. it's a while {} loop
5169 rather than a do {} while loop).
5171 Each entry to WHILEM represents a successful match of A. The count in the
5172 CURLYX block is incremented, another WHILEM state is pushed, and execution
5173 passes to A or B depending on greediness and the current count.
5175 For example, if matching against the string a1a2a3b (where the aN are
5176 substrings that match /A/), then the match progresses as follows: (the
5177 pushed states are interspersed with the bits of strings matched so far):
5180 <CURLYX cnt=0><WHILEM>
5181 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5182 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5183 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5184 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5186 (Contrast this with something like CURLYM, which maintains only a single
5190 a1 <CURLYM cnt=1> a2
5191 a1 a2 <CURLYM cnt=2> a3
5192 a1 a2 a3 <CURLYM cnt=3> b
5195 Each WHILEM state block marks a point to backtrack to upon partial failure
5196 of A or B, and also contains some minor state data related to that
5197 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5198 overall state, such as the count, and pointers to the A and B ops.
5200 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5201 must always point to the *current* CURLYX block, the rules are:
5203 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5204 and set cur_curlyx to point the new block.
5206 When popping the CURLYX block after a successful or unsuccessful match,
5207 restore the previous cur_curlyx.
5209 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5210 to the outer one saved in the CURLYX block.
5212 When popping the WHILEM block after a successful or unsuccessful B match,
5213 restore the previous cur_curlyx.
5215 Here's an example for the pattern (AI* BI)*BO
5216 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5219 curlyx backtrack stack
5220 ------ ---------------
5222 CO <CO prev=NULL> <WO>
5223 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5224 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5225 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5227 At this point the pattern succeeds, and we work back down the stack to
5228 clean up, restoring as we go:
5230 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5231 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5232 CO <CO prev=NULL> <WO>
5235 *******************************************************************/
5237 #define ST st->u.curlyx
5239 case CURLYX: /* start of /A*B/ (for complex A) */
5241 /* No need to save/restore up to this paren */
5242 I32 parenfloor = scan->flags;
5244 assert(next); /* keep Coverity happy */
5245 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5248 /* XXXX Probably it is better to teach regpush to support
5249 parenfloor > PL_regsize... */
5250 if (parenfloor > (I32)rex->lastparen)
5251 parenfloor = rex->lastparen; /* Pessimization... */
5253 ST.prev_curlyx= cur_curlyx;
5255 ST.cp = PL_savestack_ix;
5257 /* these fields contain the state of the current curly.
5258 * they are accessed by subsequent WHILEMs */
5259 ST.parenfloor = parenfloor;
5264 ST.count = -1; /* this will be updated by WHILEM */
5265 ST.lastloc = NULL; /* this will be updated by WHILEM */
5267 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5268 assert(0); /* NOTREACHED */
5271 case CURLYX_end: /* just finished matching all of A*B */
5272 cur_curlyx = ST.prev_curlyx;
5274 assert(0); /* NOTREACHED */
5276 case CURLYX_end_fail: /* just failed to match all of A*B */
5278 cur_curlyx = ST.prev_curlyx;
5280 assert(0); /* NOTREACHED */
5284 #define ST st->u.whilem
5286 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5288 /* see the discussion above about CURLYX/WHILEM */
5290 int min = ARG1(cur_curlyx->u.curlyx.me);
5291 int max = ARG2(cur_curlyx->u.curlyx.me);
5292 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5294 assert(cur_curlyx); /* keep Coverity happy */
5295 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5296 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5297 ST.cache_offset = 0;
5301 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5302 "%*s whilem: matched %ld out of %d..%d\n",
5303 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5306 /* First just match a string of min A's. */
5309 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5310 cur_curlyx->u.curlyx.lastloc = locinput;
5311 REGCP_SET(ST.lastcp);
5313 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5314 assert(0); /* NOTREACHED */
5317 /* If degenerate A matches "", assume A done. */
5319 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5320 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5321 "%*s whilem: empty match detected, trying continuation...\n",
5322 REPORT_CODE_OFF+depth*2, "")
5324 goto do_whilem_B_max;
5327 /* super-linear cache processing */
5331 if (!PL_reg_maxiter) {
5332 /* start the countdown: Postpone detection until we
5333 * know the match is not *that* much linear. */
5334 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5335 /* possible overflow for long strings and many CURLYX's */
5336 if (PL_reg_maxiter < 0)
5337 PL_reg_maxiter = I32_MAX;
5338 PL_reg_leftiter = PL_reg_maxiter;
5341 if (PL_reg_leftiter-- == 0) {
5342 /* initialise cache */
5343 const I32 size = (PL_reg_maxiter + 7)/8;
5344 if (PL_reg_poscache) {
5345 if ((I32)PL_reg_poscache_size < size) {
5346 Renew(PL_reg_poscache, size, char);
5347 PL_reg_poscache_size = size;
5349 Zero(PL_reg_poscache, size, char);
5352 PL_reg_poscache_size = size;
5353 Newxz(PL_reg_poscache, size, char);
5355 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5356 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5357 PL_colors[4], PL_colors[5])
5361 if (PL_reg_leftiter < 0) {
5362 /* have we already failed at this position? */
5364 offset = (scan->flags & 0xf) - 1
5365 + (locinput - PL_bostr) * (scan->flags>>4);
5366 mask = 1 << (offset % 8);
5368 if (PL_reg_poscache[offset] & mask) {
5369 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5370 "%*s whilem: (cache) already tried at this position...\n",
5371 REPORT_CODE_OFF+depth*2, "")
5373 sayNO; /* cache records failure */
5375 ST.cache_offset = offset;
5376 ST.cache_mask = mask;
5380 /* Prefer B over A for minimal matching. */
5382 if (cur_curlyx->u.curlyx.minmod) {
5383 ST.save_curlyx = cur_curlyx;
5384 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5385 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5386 REGCP_SET(ST.lastcp);
5387 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5389 assert(0); /* NOTREACHED */
5392 /* Prefer A over B for maximal matching. */
5394 if (n < max) { /* More greed allowed? */
5395 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5396 cur_curlyx->u.curlyx.lastloc = locinput;
5397 REGCP_SET(ST.lastcp);
5398 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5399 assert(0); /* NOTREACHED */
5401 goto do_whilem_B_max;
5403 assert(0); /* NOTREACHED */
5405 case WHILEM_B_min: /* just matched B in a minimal match */
5406 case WHILEM_B_max: /* just matched B in a maximal match */
5407 cur_curlyx = ST.save_curlyx;
5409 assert(0); /* NOTREACHED */
5411 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5412 cur_curlyx = ST.save_curlyx;
5413 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5414 cur_curlyx->u.curlyx.count--;
5416 assert(0); /* NOTREACHED */
5418 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5420 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5421 REGCP_UNWIND(ST.lastcp);
5423 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5424 cur_curlyx->u.curlyx.count--;
5426 assert(0); /* NOTREACHED */
5428 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5429 REGCP_UNWIND(ST.lastcp);
5430 regcppop(rex); /* Restore some previous $<digit>s? */
5431 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5432 "%*s whilem: failed, trying continuation...\n",
5433 REPORT_CODE_OFF+depth*2, "")
5436 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5437 && ckWARN(WARN_REGEXP)
5438 && !(PL_reg_flags & RF_warned))
5440 PL_reg_flags |= RF_warned;
5441 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5442 "Complex regular subexpression recursion limit (%d) "
5448 ST.save_curlyx = cur_curlyx;
5449 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5450 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5452 assert(0); /* NOTREACHED */
5454 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5455 cur_curlyx = ST.save_curlyx;
5456 REGCP_UNWIND(ST.lastcp);
5459 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5460 /* Maximum greed exceeded */
5461 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5462 && ckWARN(WARN_REGEXP)
5463 && !(PL_reg_flags & RF_warned))
5465 PL_reg_flags |= RF_warned;
5466 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5467 "Complex regular subexpression recursion "
5468 "limit (%d) exceeded",
5471 cur_curlyx->u.curlyx.count--;
5475 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5476 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5478 /* Try grabbing another A and see if it helps. */
5479 cur_curlyx->u.curlyx.lastloc = locinput;
5480 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5481 REGCP_SET(ST.lastcp);
5482 PUSH_STATE_GOTO(WHILEM_A_min,
5483 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5485 assert(0); /* NOTREACHED */
5488 #define ST st->u.branch
5490 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5491 next = scan + ARG(scan);
5494 scan = NEXTOPER(scan);
5497 case BRANCH: /* /(...|A|...)/ */
5498 scan = NEXTOPER(scan); /* scan now points to inner node */
5499 ST.lastparen = rex->lastparen;
5500 ST.lastcloseparen = rex->lastcloseparen;
5501 ST.next_branch = next;
5504 /* Now go into the branch */
5506 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5508 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5510 assert(0); /* NOTREACHED */
5512 case CUTGROUP: /* /(*THEN)/ */
5513 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5514 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5515 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5516 assert(0); /* NOTREACHED */
5518 case CUTGROUP_next_fail:
5521 if (st->u.mark.mark_name)
5522 sv_commit = st->u.mark.mark_name;
5524 assert(0); /* NOTREACHED */
5528 assert(0); /* NOTREACHED */
5530 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5535 REGCP_UNWIND(ST.cp);
5536 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5537 scan = ST.next_branch;
5538 /* no more branches? */
5539 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5541 PerlIO_printf( Perl_debug_log,
5542 "%*s %sBRANCH failed...%s\n",
5543 REPORT_CODE_OFF+depth*2, "",
5549 continue; /* execute next BRANCH[J] op */
5550 assert(0); /* NOTREACHED */
5552 case MINMOD: /* next op will be non-greedy, e.g. A*? */
5557 #define ST st->u.curlym
5559 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5561 /* This is an optimisation of CURLYX that enables us to push
5562 * only a single backtracking state, no matter how many matches
5563 * there are in {m,n}. It relies on the pattern being constant
5564 * length, with no parens to influence future backrefs
5568 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5570 ST.lastparen = rex->lastparen;
5571 ST.lastcloseparen = rex->lastcloseparen;
5573 /* if paren positive, emulate an OPEN/CLOSE around A */
5575 U32 paren = ST.me->flags;
5576 if (paren > PL_regsize)
5578 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5586 ST.c1 = CHRTEST_UNINIT;
5589 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5592 curlym_do_A: /* execute the A in /A{m,n}B/ */
5593 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5594 assert(0); /* NOTREACHED */
5596 case CURLYM_A: /* we've just matched an A */
5598 /* after first match, determine A's length: u.curlym.alen */
5599 if (ST.count == 1) {
5600 if (PL_reg_match_utf8) {
5601 char *s = st->locinput;
5602 while (s < locinput) {
5608 ST.alen = locinput - st->locinput;
5611 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5614 PerlIO_printf(Perl_debug_log,
5615 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5616 (int)(REPORT_CODE_OFF+(depth*2)), "",
5617 (IV) ST.count, (IV)ST.alen)
5620 if (cur_eval && cur_eval->u.eval.close_paren &&
5621 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5625 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5626 if ( max == REG_INFTY || ST.count < max )
5627 goto curlym_do_A; /* try to match another A */
5629 goto curlym_do_B; /* try to match B */
5631 case CURLYM_A_fail: /* just failed to match an A */
5632 REGCP_UNWIND(ST.cp);
5634 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5635 || (cur_eval && cur_eval->u.eval.close_paren &&
5636 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5639 curlym_do_B: /* execute the B in /A{m,n}B/ */
5640 if (ST.c1 == CHRTEST_UNINIT) {
5641 /* calculate c1 and c2 for possible match of 1st char
5642 * following curly */
5643 ST.c1 = ST.c2 = CHRTEST_VOID;
5644 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5645 regnode *text_node = ST.B;
5646 if (! HAS_TEXT(text_node))
5647 FIND_NEXT_IMPT(text_node);
5650 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5652 But the former is redundant in light of the latter.
5654 if this changes back then the macro for
5655 IS_TEXT and friends need to change.
5657 if (PL_regkind[OP(text_node)] == EXACT) {
5658 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5659 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5668 PerlIO_printf(Perl_debug_log,
5669 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5670 (int)(REPORT_CODE_OFF+(depth*2)),
5673 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5674 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5675 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5676 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5678 /* simulate B failing */
5680 PerlIO_printf(Perl_debug_log,
5681 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5682 (int)(REPORT_CODE_OFF+(depth*2)),"",
5683 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5684 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5685 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5687 state_num = CURLYM_B_fail;
5688 goto reenter_switch;
5691 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5692 /* simulate B failing */
5694 PerlIO_printf(Perl_debug_log,
5695 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5696 (int)(REPORT_CODE_OFF+(depth*2)),"",
5697 (int) nextchr, ST.c1, ST.c2)
5699 state_num = CURLYM_B_fail;
5700 goto reenter_switch;
5705 /* emulate CLOSE: mark current A as captured */
5706 I32 paren = ST.me->flags;
5708 rex->offs[paren].start
5709 = HOPc(locinput, -ST.alen) - PL_bostr;
5710 rex->offs[paren].end = locinput - PL_bostr;
5711 if ((U32)paren > rex->lastparen)
5712 rex->lastparen = paren;
5713 rex->lastcloseparen = paren;
5716 rex->offs[paren].end = -1;
5717 if (cur_eval && cur_eval->u.eval.close_paren &&
5718 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5727 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5728 assert(0); /* NOTREACHED */
5730 case CURLYM_B_fail: /* just failed to match a B */
5731 REGCP_UNWIND(ST.cp);
5732 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5734 I32 max = ARG2(ST.me);
5735 if (max != REG_INFTY && ST.count == max)
5737 goto curlym_do_A; /* try to match a further A */
5739 /* backtrack one A */
5740 if (ST.count == ARG1(ST.me) /* min */)
5743 SET_locinput(HOPc(locinput, -ST.alen));
5744 goto curlym_do_B; /* try to match B */
5747 #define ST st->u.curly
5749 #define CURLY_SETPAREN(paren, success) \
5752 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5753 rex->offs[paren].end = locinput - PL_bostr; \
5754 if (paren > rex->lastparen) \
5755 rex->lastparen = paren; \
5756 rex->lastcloseparen = paren; \
5759 rex->offs[paren].end = -1; \
5760 rex->lastparen = ST.lastparen; \
5761 rex->lastcloseparen = ST.lastcloseparen; \
5765 case STAR: /* /A*B/ where A is width 1 char */
5769 scan = NEXTOPER(scan);
5772 case PLUS: /* /A+B/ where A is width 1 char */
5776 scan = NEXTOPER(scan);
5779 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5780 ST.paren = scan->flags; /* Which paren to set */
5781 ST.lastparen = rex->lastparen;
5782 ST.lastcloseparen = rex->lastcloseparen;
5783 if (ST.paren > PL_regsize)
5784 PL_regsize = ST.paren;
5785 ST.min = ARG1(scan); /* min to match */
5786 ST.max = ARG2(scan); /* max to match */
5787 if (cur_eval && cur_eval->u.eval.close_paren &&
5788 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5792 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5795 case CURLY: /* /A{m,n}B/ where A is width 1 char */
5797 ST.min = ARG1(scan); /* min to match */
5798 ST.max = ARG2(scan); /* max to match */
5799 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5802 * Lookahead to avoid useless match attempts
5803 * when we know what character comes next.
5805 * Used to only do .*x and .*?x, but now it allows
5806 * for )'s, ('s and (?{ ... })'s to be in the way
5807 * of the quantifier and the EXACT-like node. -- japhy
5810 assert(ST.min <= ST.max);
5811 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5812 ST.c1 = ST.c2 = CHRTEST_VOID;
5815 regnode *text_node = next;
5817 if (! HAS_TEXT(text_node))
5818 FIND_NEXT_IMPT(text_node);
5820 if (! HAS_TEXT(text_node))
5821 ST.c1 = ST.c2 = CHRTEST_VOID;
5823 if ( PL_regkind[OP(text_node)] != EXACT ) {
5824 ST.c1 = ST.c2 = CHRTEST_VOID;
5828 /* Currently we only get here when
5830 PL_rekind[OP(text_node)] == EXACT
5832 if this changes back then the macro for IS_TEXT and
5833 friends need to change. */
5834 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5835 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5846 char *li = locinput;
5848 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5853 if (ST.c1 == CHRTEST_VOID)
5854 goto curly_try_B_min;
5856 ST.oldloc = locinput;
5858 /* set ST.maxpos to the furthest point along the
5859 * string that could possibly match */
5860 if (ST.max == REG_INFTY) {
5861 ST.maxpos = PL_regeol - 1;
5863 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5866 else if (utf8_target) {
5867 int m = ST.max - ST.min;
5868 for (ST.maxpos = locinput;
5869 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5870 ST.maxpos += UTF8SKIP(ST.maxpos);
5873 ST.maxpos = locinput + ST.max - ST.min;
5874 if (ST.maxpos >= PL_regeol)
5875 ST.maxpos = PL_regeol - 1;
5877 goto curly_try_B_min_known;
5881 /* avoid taking address of locinput, so it can remain
5883 char *li = locinput;
5884 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5885 if (ST.count < ST.min)
5888 if ((ST.count > ST.min)
5889 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5891 /* A{m,n} must come at the end of the string, there's
5892 * no point in backing off ... */
5894 /* ...except that $ and \Z can match before *and* after
5895 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5896 We may back off by one in this case. */
5897 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5901 goto curly_try_B_max;
5903 assert(0); /* NOTREACHED */
5906 case CURLY_B_min_known_fail:
5907 /* failed to find B in a non-greedy match where c1,c2 valid */
5909 REGCP_UNWIND(ST.cp);
5911 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5913 /* Couldn't or didn't -- move forward. */
5914 ST.oldloc = locinput;
5916 locinput += UTF8SKIP(locinput);
5920 curly_try_B_min_known:
5921 /* find the next place where 'B' could work, then call B */
5925 n = (ST.oldloc == locinput) ? 0 : 1;
5926 if (ST.c1 == ST.c2) {
5927 /* set n to utf8_distance(oldloc, locinput) */
5928 while (locinput <= ST.maxpos
5929 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5931 locinput += UTF8SKIP(locinput);
5936 /* set n to utf8_distance(oldloc, locinput) */
5937 while (locinput <= ST.maxpos
5938 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5939 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5941 locinput += UTF8SKIP(locinput);
5946 else { /* Not utf8_target */
5947 if (ST.c1 == ST.c2) {
5948 while (locinput <= ST.maxpos &&
5949 UCHARAT(locinput) != ST.c1)
5953 while (locinput <= ST.maxpos
5954 && UCHARAT(locinput) != ST.c1
5955 && UCHARAT(locinput) != ST.c2)
5958 n = locinput - ST.oldloc;
5960 if (locinput > ST.maxpos)
5963 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5964 * at b; check that everything between oldloc and
5965 * locinput matches */
5966 char *li = ST.oldloc;
5968 if (regrepeat(rex, &li, ST.A, n, depth) < n)
5970 assert(n == REG_INFTY || locinput == li);
5972 CURLY_SETPAREN(ST.paren, ST.count);
5973 if (cur_eval && cur_eval->u.eval.close_paren &&
5974 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5977 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
5979 assert(0); /* NOTREACHED */
5982 case CURLY_B_min_fail:
5983 /* failed to find B in a non-greedy match where c1,c2 invalid */
5985 REGCP_UNWIND(ST.cp);
5987 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5989 /* failed -- move forward one */
5991 char *li = locinput;
5992 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
5999 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6000 ST.count > 0)) /* count overflow ? */
6003 CURLY_SETPAREN(ST.paren, ST.count);
6004 if (cur_eval && cur_eval->u.eval.close_paren &&
6005 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6008 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6012 assert(0); /* NOTREACHED */
6016 /* a successful greedy match: now try to match B */
6017 if (cur_eval && cur_eval->u.eval.close_paren &&
6018 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6022 bool could_match = locinput < PL_regeol;
6024 /* If it could work, try it. */
6025 if (ST.c1 != CHRTEST_VOID && could_match) {
6026 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6028 could_match = memEQ(locinput,
6033 UTF8SKIP(locinput));
6036 could_match = UCHARAT(locinput) == ST.c1
6037 || UCHARAT(locinput) == ST.c2;
6040 if (ST.c1 == CHRTEST_VOID || could_match) {
6041 CURLY_SETPAREN(ST.paren, ST.count);
6042 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6043 assert(0); /* NOTREACHED */
6048 case CURLY_B_max_fail:
6049 /* failed to find B in a greedy match */
6051 REGCP_UNWIND(ST.cp);
6053 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6056 if (--ST.count < ST.min)
6058 locinput = HOPc(locinput, -1);
6059 goto curly_try_B_max;
6063 case END: /* last op of main pattern */
6066 /* we've just finished A in /(??{A})B/; now continue with B */
6067 st->u.eval.toggle_reg_flags
6068 = cur_eval->u.eval.toggle_reg_flags;
6069 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6071 st->u.eval.prev_rex = rex_sv; /* inner */
6072 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
6073 rex_sv = cur_eval->u.eval.prev_rex;
6074 SET_reg_curpm(rex_sv);
6075 rex = (struct regexp *)SvANY(rex_sv);
6076 rexi = RXi_GET(rex);
6077 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6079 REGCP_SET(st->u.eval.lastcp);
6081 /* Restore parens of the outer rex without popping the
6083 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
6085 st->u.eval.prev_eval = cur_eval;
6086 cur_eval = cur_eval->u.eval.prev_eval;
6088 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6089 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6090 if ( nochange_depth )
6093 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6094 locinput); /* match B */
6097 if (locinput < reginfo->till) {
6098 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6099 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6101 (long)(locinput - PL_reg_starttry),
6102 (long)(reginfo->till - PL_reg_starttry),
6105 sayNO_SILENT; /* Cannot match: too short. */
6107 sayYES; /* Success! */
6109 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6111 PerlIO_printf(Perl_debug_log,
6112 "%*s %ssubpattern success...%s\n",
6113 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6114 sayYES; /* Success! */
6117 #define ST st->u.ifmatch
6122 case SUSPEND: /* (?>A) */
6124 newstart = locinput;
6127 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6129 goto ifmatch_trivial_fail_test;
6131 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6133 ifmatch_trivial_fail_test:
6135 char * const s = HOPBACKc(locinput, scan->flags);
6140 sw = 1 - cBOOL(ST.wanted);
6144 next = scan + ARG(scan);
6152 newstart = locinput;
6156 ST.logical = logical;
6157 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6159 /* execute body of (?...A) */
6160 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6161 assert(0); /* NOTREACHED */
6164 case IFMATCH_A_fail: /* body of (?...A) failed */
6165 ST.wanted = !ST.wanted;
6168 case IFMATCH_A: /* body of (?...A) succeeded */
6170 sw = cBOOL(ST.wanted);
6172 else if (!ST.wanted)
6175 if (OP(ST.me) != SUSPEND) {
6176 /* restore old position except for (?>...) */
6177 locinput = st->locinput;
6179 scan = ST.me + ARG(ST.me);
6182 continue; /* execute B */
6186 case LONGJMP: /* alternative with many branches compiles to
6187 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6188 next = scan + ARG(scan);
6193 case COMMIT: /* (*COMMIT) */
6194 reginfo->cutpoint = PL_regeol;
6197 case PRUNE: /* (*PRUNE) */
6199 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6200 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6201 assert(0); /* NOTREACHED */
6203 case COMMIT_next_fail:
6207 case OPFAIL: /* (*FAIL) */
6209 assert(0); /* NOTREACHED */
6211 #define ST st->u.mark
6212 case MARKPOINT: /* (*MARK:foo) */
6213 ST.prev_mark = mark_state;
6214 ST.mark_name = sv_commit = sv_yes_mark
6215 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6217 ST.mark_loc = locinput;
6218 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6219 assert(0); /* NOTREACHED */
6221 case MARKPOINT_next:
6222 mark_state = ST.prev_mark;
6224 assert(0); /* NOTREACHED */
6226 case MARKPOINT_next_fail:
6227 if (popmark && sv_eq(ST.mark_name,popmark))
6229 if (ST.mark_loc > startpoint)
6230 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6231 popmark = NULL; /* we found our mark */
6232 sv_commit = ST.mark_name;
6235 PerlIO_printf(Perl_debug_log,
6236 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6237 REPORT_CODE_OFF+depth*2, "",
6238 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6241 mark_state = ST.prev_mark;
6242 sv_yes_mark = mark_state ?
6243 mark_state->u.mark.mark_name : NULL;
6245 assert(0); /* NOTREACHED */
6247 case SKIP: /* (*SKIP) */
6249 /* (*SKIP) : if we fail we cut here*/
6250 ST.mark_name = NULL;
6251 ST.mark_loc = locinput;
6252 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6254 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6255 otherwise do nothing. Meaning we need to scan
6257 regmatch_state *cur = mark_state;
6258 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6261 if ( sv_eq( cur->u.mark.mark_name,
6264 ST.mark_name = find;
6265 PUSH_STATE_GOTO( SKIP_next, next, locinput);
6267 cur = cur->u.mark.prev_mark;
6270 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6273 case SKIP_next_fail:
6275 /* (*CUT:NAME) - Set up to search for the name as we
6276 collapse the stack*/
6277 popmark = ST.mark_name;
6279 /* (*CUT) - No name, we cut here.*/
6280 if (ST.mark_loc > startpoint)
6281 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6282 /* but we set sv_commit to latest mark_name if there
6283 is one so they can test to see how things lead to this
6286 sv_commit=mark_state->u.mark.mark_name;
6290 assert(0); /* NOTREACHED */
6293 case LNBREAK: /* \R */
6294 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6300 #define CASE_CLASS(nAmE) \
6302 if (NEXTCHR_IS_EOS) \
6304 if ((n=is_##nAmE(locinput,utf8_target))) { \
6310 if (NEXTCHR_IS_EOS) \
6312 if ((n=is_##nAmE(locinput,utf8_target))) { \
6315 locinput += UTF8SKIP(locinput); \
6319 CASE_CLASS(VERTWS); /* \v \V */
6320 CASE_CLASS(HORIZWS); /* \h \H */
6324 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6325 PTR2UV(scan), OP(scan));
6326 Perl_croak(aTHX_ "regexp memory corruption");
6328 /* this is a point to jump to in order to increment
6329 * locinput by one character */
6331 assert(!NEXTCHR_IS_EOS);
6333 locinput += PL_utf8skip[nextchr];
6334 /* locinput is allowed to go 1 char off the end, but not 2+ */
6335 if (locinput > PL_regeol)
6344 /* switch break jumps here */
6345 scan = next; /* prepare to execute the next op and ... */
6346 continue; /* ... jump back to the top, reusing st */
6347 assert(0); /* NOTREACHED */
6350 /* push a state that backtracks on success */
6351 st->u.yes.prev_yes_state = yes_state;
6355 /* push a new regex state, then continue at scan */
6357 regmatch_state *newst;
6360 regmatch_state *cur = st;
6361 regmatch_state *curyes = yes_state;
6363 regmatch_slab *slab = PL_regmatch_slab;
6364 for (;curd > -1;cur--,curd--) {
6365 if (cur < SLAB_FIRST(slab)) {
6367 cur = SLAB_LAST(slab);
6369 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6370 REPORT_CODE_OFF + 2 + depth * 2,"",
6371 curd, PL_reg_name[cur->resume_state],
6372 (curyes == cur) ? "yes" : ""
6375 curyes = cur->u.yes.prev_yes_state;
6378 DEBUG_STATE_pp("push")
6381 st->locinput = locinput;
6383 if (newst > SLAB_LAST(PL_regmatch_slab))
6384 newst = S_push_slab(aTHX);
6385 PL_regmatch_state = newst;
6387 locinput = pushinput;
6390 assert(0); /* NOTREACHED */
6395 * We get here only if there's trouble -- normally "case END" is
6396 * the terminating point.
6398 Perl_croak(aTHX_ "corrupted regexp pointers");
6404 /* we have successfully completed a subexpression, but we must now
6405 * pop to the state marked by yes_state and continue from there */
6406 assert(st != yes_state);
6408 while (st != yes_state) {
6410 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6411 PL_regmatch_slab = PL_regmatch_slab->prev;
6412 st = SLAB_LAST(PL_regmatch_slab);
6416 DEBUG_STATE_pp("pop (no final)");
6418 DEBUG_STATE_pp("pop (yes)");
6424 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6425 || yes_state > SLAB_LAST(PL_regmatch_slab))
6427 /* not in this slab, pop slab */
6428 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6429 PL_regmatch_slab = PL_regmatch_slab->prev;
6430 st = SLAB_LAST(PL_regmatch_slab);
6432 depth -= (st - yes_state);
6435 yes_state = st->u.yes.prev_yes_state;
6436 PL_regmatch_state = st;
6439 locinput= st->locinput;
6440 state_num = st->resume_state + no_final;
6441 goto reenter_switch;
6444 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6445 PL_colors[4], PL_colors[5]));
6447 if (PL_reg_state.re_state_eval_setup_done) {
6448 /* each successfully executed (?{...}) block does the equivalent of
6449 * local $^R = do {...}
6450 * When popping the save stack, all these locals would be undone;
6451 * bypass this by setting the outermost saved $^R to the latest
6453 if (oreplsv != GvSV(PL_replgv))
6454 sv_setsv(oreplsv, GvSV(PL_replgv));
6461 PerlIO_printf(Perl_debug_log,
6462 "%*s %sfailed...%s\n",
6463 REPORT_CODE_OFF+depth*2, "",
6464 PL_colors[4], PL_colors[5])
6476 /* there's a previous state to backtrack to */
6478 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6479 PL_regmatch_slab = PL_regmatch_slab->prev;
6480 st = SLAB_LAST(PL_regmatch_slab);
6482 PL_regmatch_state = st;
6483 locinput= st->locinput;
6485 DEBUG_STATE_pp("pop");
6487 if (yes_state == st)
6488 yes_state = st->u.yes.prev_yes_state;
6490 state_num = st->resume_state + 1; /* failure = success + 1 */
6491 goto reenter_switch;
6496 if (rex->intflags & PREGf_VERBARG_SEEN) {
6497 SV *sv_err = get_sv("REGERROR", 1);
6498 SV *sv_mrk = get_sv("REGMARK", 1);
6500 sv_commit = &PL_sv_no;
6502 sv_yes_mark = &PL_sv_yes;
6505 sv_commit = &PL_sv_yes;
6506 sv_yes_mark = &PL_sv_no;
6508 sv_setsv(sv_err, sv_commit);
6509 sv_setsv(sv_mrk, sv_yes_mark);
6513 if (last_pushed_cv) {
6516 PERL_UNUSED_VAR(SP);
6519 /* clean up; in particular, free all slabs above current one */
6520 LEAVE_SCOPE(oldsave);
6522 assert(!result || locinput - PL_bostr >= 0);
6523 return result ? locinput - PL_bostr : -1;
6527 - regrepeat - repeatedly match something simple, report how many
6529 * startposp - pointer a pointer to the start position. This is updated
6530 * to point to the byte following the highest successful
6532 * p - the regnode to be repeatedly matched against.
6533 * max - maximum number of characters to match.
6534 * depth - (for debugging) backtracking depth.
6537 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6542 char *loceol = PL_regeol;
6544 bool utf8_target = PL_reg_match_utf8;
6547 PERL_UNUSED_ARG(depth);
6550 PERL_ARGS_ASSERT_REGREPEAT;
6553 if (max == REG_INFTY)
6555 else if (max < loceol - scan)
6556 loceol = scan + max;
6561 while (scan < loceol && hardcount < max && *scan != '\n') {
6562 scan += UTF8SKIP(scan);
6566 while (scan < loceol && *scan != '\n')
6573 while (scan < loceol && hardcount < max) {
6574 scan += UTF8SKIP(scan);
6585 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6589 /* Can use a simple loop if the pattern char to match on is invariant
6590 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6591 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6592 * true iff it doesn't matter if the argument is in UTF-8 or not */
6593 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6594 while (scan < loceol && UCHARAT(scan) == c) {
6598 else if (UTF_PATTERN) {
6600 STRLEN scan_char_len;
6603 /* When both target and pattern are UTF-8, we have to do s
6605 while (hardcount < max
6606 && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6607 && scan_char_len <= STR_LEN(p)
6608 && memEQ(scan, STRING(p), scan_char_len))
6610 scan += scan_char_len;
6614 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6616 /* Target isn't utf8; convert the character in the UTF-8
6617 * pattern to non-UTF8, and do a simple loop */
6618 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6619 while (scan < loceol && UCHARAT(scan) == c) {
6622 } /* else pattern char is above Latin1, can't possibly match the
6627 /* Here, the string must be utf8; pattern isn't, and <c> is
6628 * different in utf8 than not, so can't compare them directly.
6629 * Outside the loop, find the two utf8 bytes that represent c, and
6630 * then look for those in sequence in the utf8 string */
6631 U8 high = UTF8_TWO_BYTE_HI(c);
6632 U8 low = UTF8_TWO_BYTE_LO(c);
6635 while (hardcount < max
6636 && scan + 1 < loceol
6637 && UCHARAT(scan) == high
6638 && UCHARAT(scan + 1) == low)
6647 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6651 PL_reg_flags |= RF_tainted;
6652 utf8_flags = FOLDEQ_UTF8_LOCALE;
6660 case EXACTFU_TRICKYFOLD:
6662 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6666 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6668 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6670 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6671 if (c1 == CHRTEST_VOID) {
6672 /* Use full Unicode fold matching */
6673 char *tmpeol = loceol;
6674 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6675 while (hardcount < max
6676 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6677 STRING(p), NULL, pat_len,
6678 cBOOL(UTF_PATTERN), utf8_flags))
6685 else if (utf8_target) {
6687 while (hardcount < max
6688 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6690 scan += UTF8SKIP(scan);
6695 while (hardcount < max
6696 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6697 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6699 scan += UTF8SKIP(scan);
6704 else if (c1 == c2) {
6705 while (scan < loceol && UCHARAT(scan) == c1) {
6710 while (scan < loceol &&
6711 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6723 inclasslen = loceol - scan;
6724 while (hardcount < max
6725 && ((inclasslen = loceol - scan) > 0)
6726 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6732 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6740 LOAD_UTF8_CHARCLASS_ALNUM();
6741 while (hardcount < max && scan < loceol &&
6742 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6744 scan += UTF8SKIP(scan);
6748 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6756 while (scan < loceol && isALNUM((U8) *scan)) {
6761 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6766 PL_reg_flags |= RF_tainted;
6769 while (hardcount < max && scan < loceol &&
6770 isALNUM_LC_utf8((U8*)scan)) {
6771 scan += UTF8SKIP(scan);
6775 while (scan < loceol && isALNUM_LC(*scan))
6785 LOAD_UTF8_CHARCLASS_ALNUM();
6786 while (hardcount < max && scan < loceol &&
6787 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6789 scan += UTF8SKIP(scan);
6793 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6800 goto utf8_Nwordchar;
6801 while (scan < loceol && ! isALNUM((U8) *scan)) {
6807 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6813 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6814 scan += UTF8SKIP(scan);
6818 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6825 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6826 scan += UTF8SKIP(scan);
6830 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6836 PL_reg_flags |= RF_tainted;
6839 while (hardcount < max && scan < loceol &&
6840 !isALNUM_LC_utf8((U8*)scan)) {
6841 scan += UTF8SKIP(scan);
6845 while (scan < loceol && !isALNUM_LC(*scan))
6855 LOAD_UTF8_CHARCLASS_SPACE();
6856 while (hardcount < max && scan < loceol &&
6858 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6860 scan += UTF8SKIP(scan);
6866 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6875 while (scan < loceol && isSPACE((U8) *scan)) {
6880 while (scan < loceol && isSPACE_A((U8) *scan)) {
6885 PL_reg_flags |= RF_tainted;
6888 while (hardcount < max && scan < loceol &&
6889 isSPACE_LC_utf8((U8*)scan)) {
6890 scan += UTF8SKIP(scan);
6894 while (scan < loceol && isSPACE_LC(*scan))
6904 LOAD_UTF8_CHARCLASS_SPACE();
6905 while (hardcount < max && scan < loceol &&
6907 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6909 scan += UTF8SKIP(scan);
6915 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6924 while (scan < loceol && ! isSPACE((U8) *scan)) {
6930 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6931 scan += UTF8SKIP(scan);
6935 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6941 PL_reg_flags |= RF_tainted;
6944 while (hardcount < max && scan < loceol &&
6945 !isSPACE_LC_utf8((U8*)scan)) {
6946 scan += UTF8SKIP(scan);
6950 while (scan < loceol && !isSPACE_LC(*scan))
6957 LOAD_UTF8_CHARCLASS_DIGIT();
6958 while (hardcount < max && scan < loceol &&
6959 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6960 scan += UTF8SKIP(scan);
6964 while (scan < loceol && isDIGIT(*scan))
6969 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6974 PL_reg_flags |= RF_tainted;
6977 while (hardcount < max && scan < loceol &&
6978 isDIGIT_LC_utf8((U8*)scan)) {
6979 scan += UTF8SKIP(scan);
6983 while (scan < loceol && isDIGIT_LC(*scan))
6990 LOAD_UTF8_CHARCLASS_DIGIT();
6991 while (hardcount < max && scan < loceol &&
6992 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6993 scan += UTF8SKIP(scan);
6997 while (scan < loceol && !isDIGIT(*scan))
7003 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7004 scan += UTF8SKIP(scan);
7008 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7014 PL_reg_flags |= RF_tainted;
7017 while (hardcount < max && scan < loceol &&
7018 !isDIGIT_LC_utf8((U8*)scan)) {
7019 scan += UTF8SKIP(scan);
7023 while (scan < loceol && !isDIGIT_LC(*scan))
7028 Perl_croak(aTHX_ "panic: regrepeat() should not be called with non-simple: LNBREAK");
7029 assert(0); /* NOTREACHED */
7033 while (hardcount < max && scan < loceol &&
7034 (c=is_HORIZWS_utf8_safe(scan, loceol)))
7040 while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7047 while (hardcount < max && scan < loceol &&
7048 !is_HORIZWS_utf8_safe(scan, loceol))
7050 scan += UTF8SKIP(scan);
7054 while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7062 while (hardcount < max && scan < loceol &&
7063 (c=is_VERTWS_utf8_safe(scan, loceol)))
7069 while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7077 while (hardcount < max && scan < loceol &&
7078 !is_VERTWS_utf8_safe(scan, loceol))
7080 scan += UTF8SKIP(scan);
7084 while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7090 default: /* Called on something of 0 width. */
7091 break; /* So match right here or not at all. */
7097 c = scan - *startposp;
7101 GET_RE_DEBUG_FLAGS_DECL;
7103 SV * const prop = sv_newmortal();
7104 regprop(prog, prop, p);
7105 PerlIO_printf(Perl_debug_log,
7106 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7107 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7115 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7117 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7118 create a copy so that changes the caller makes won't change the shared one.
7119 If <altsvp> is non-null, will return NULL in it, for back-compat.
7122 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7124 PERL_ARGS_ASSERT_REGCLASS_SWASH;
7130 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7135 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
7137 /* Returns the swash for the input 'node' in the regex 'prog'.
7138 * If <doinit> is true, will attempt to create the swash if not already
7140 * If <listsvp> is non-null, will return the swash initialization string in
7142 * Tied intimately to how regcomp.c sets up the data structure */
7149 RXi_GET_DECL(prog,progi);
7150 const struct reg_data * const data = prog ? progi->data : NULL;
7152 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7154 assert(ANYOF_NONBITMAP(node));
7156 if (data && data->count) {
7157 const U32 n = ARG(node);
7159 if (data->what[n] == 's') {
7160 SV * const rv = MUTABLE_SV(data->data[n]);
7161 AV * const av = MUTABLE_AV(SvRV(rv));
7162 SV **const ary = AvARRAY(av);
7163 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7165 si = *ary; /* ary[0] = the string to initialize the swash with */
7167 /* Elements 2 and 3 are either both present or both absent. [2] is
7168 * any inversion list generated at compile time; [3] indicates if
7169 * that inversion list has any user-defined properties in it. */
7170 if (av_len(av) >= 2) {
7173 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7180 /* Element [1] is reserved for the set-up swash. If already there,
7181 * return it; if not, create it and store it there */
7182 if (SvROK(ary[1])) {
7185 else if (si && doinit) {
7187 sw = _core_swash_init("utf8", /* the utf8 package */
7191 0, /* not from tr/// */
7194 (void)av_store(av, 1, sw);
7200 SV* matches_string = newSVpvn("", 0);
7202 /* Use the swash, if any, which has to have incorporated into it all
7204 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7205 && (si && si != &PL_sv_undef))
7208 /* If no swash, use the input initialization string, if available */
7209 sv_catsv(matches_string, si);
7212 /* Add the inversion list to whatever we have. This may have come from
7213 * the swash, or from an input parameter */
7215 sv_catsv(matches_string, _invlist_contents(invlist));
7217 *listsvp = matches_string;
7224 - reginclass - determine if a character falls into a character class
7226 n is the ANYOF regnode
7227 p is the target string
7228 lenp is pointer to the maximum number of bytes of how far to go in p
7229 (This is assumed wthout checking to always be at least the current
7231 utf8_target tells whether p is in UTF-8.
7233 Returns true if matched; false otherwise. If lenp is not NULL, on return
7234 from a successful match, the value it points to will be updated to how many
7235 bytes in p were matched. If there was no match, the value is undefined,
7236 possibly changed from the input.
7238 Note that this can be a synthetic start class, a combination of various
7239 nodes, so things you think might be mutually exclusive, such as locale,
7240 aren't. It can match both locale and non-locale
7245 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
7248 const char flags = ANYOF_FLAGS(n);
7254 PERL_ARGS_ASSERT_REGINCLASS;
7256 /* If c is not already the code point, get it */
7257 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
7258 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7259 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7260 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7261 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7262 * UTF8_ALLOW_FFFF */
7263 if (c_len == (STRLEN)-1)
7264 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7270 /* Use passed in max length, or one character if none passed in or less
7271 * than one character. And assume will match just one character. This is
7272 * overwritten later if matched more. */
7274 maxlen = (*lenp > c_len) ? *lenp : c_len;
7282 /* If this character is potentially in the bitmap, check it */
7284 if (ANYOF_BITMAP_TEST(n, c))
7286 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7292 else if (flags & ANYOF_LOCALE) {
7293 PL_reg_flags |= RF_tainted;
7295 if ((flags & ANYOF_LOC_FOLD)
7296 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7300 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7301 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
7302 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
7303 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
7304 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
7305 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
7306 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
7307 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
7308 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7309 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
7310 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
7311 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
7312 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
7313 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
7314 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
7315 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
7316 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
7317 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
7318 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
7319 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
7320 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
7321 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
7322 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
7323 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
7324 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
7325 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
7326 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
7327 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
7328 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
7329 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
7330 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
7331 ) /* How's that for a conditional? */
7338 /* If the bitmap didn't (or couldn't) match, and something outside the
7339 * bitmap could match, try that. Locale nodes specify completely the
7340 * behavior of code points in the bit map (otherwise, a utf8 target would
7341 * cause them to be treated as Unicode and not locale), except in
7342 * the very unlikely event when this node is a synthetic start class, which
7343 * could be a combination of locale and non-locale nodes. So allow locale
7344 * to match for the synthetic start class, which will give a false
7345 * positive that will be resolved when the match is done again as not part
7346 * of the synthetic start class */
7348 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7349 match = TRUE; /* Everything above 255 matches */
7351 else if (ANYOF_NONBITMAP(n)
7352 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7355 || (! (flags & ANYOF_LOCALE))
7356 || (flags & ANYOF_IS_SYNTHETIC)))))
7358 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7363 } else { /* Convert to utf8 */
7365 utf8_p = bytes_to_utf8(p, &len);
7368 if (swash_fetch(sw, utf8_p, TRUE)) {
7372 /* If we allocated a string above, free it */
7373 if (! utf8_target) Safefree(utf8_p);
7377 if (UNICODE_IS_SUPER(c)
7378 && (flags & ANYOF_WARN_SUPER)
7379 && ckWARN_d(WARN_NON_UNICODE))
7381 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7382 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7386 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7387 return cBOOL(flags & ANYOF_INVERT) ^ match;
7391 S_reghop3(U8 *s, I32 off, const U8* lim)
7393 /* return the position 'off' UTF-8 characters away from 's', forward if
7394 * 'off' >= 0, backwards if negative. But don't go outside of position
7395 * 'lim', which better be < s if off < 0 */
7399 PERL_ARGS_ASSERT_REGHOP3;
7402 while (off-- && s < lim) {
7403 /* XXX could check well-formedness here */
7408 while (off++ && s > lim) {
7410 if (UTF8_IS_CONTINUED(*s)) {
7411 while (s > lim && UTF8_IS_CONTINUATION(*s))
7414 /* XXX could check well-formedness here */
7421 /* there are a bunch of places where we use two reghop3's that should
7422 be replaced with this routine. but since thats not done yet
7423 we ifdef it out - dmq
7426 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7430 PERL_ARGS_ASSERT_REGHOP4;
7433 while (off-- && s < rlim) {
7434 /* XXX could check well-formedness here */
7439 while (off++ && s > llim) {
7441 if (UTF8_IS_CONTINUED(*s)) {
7442 while (s > llim && UTF8_IS_CONTINUATION(*s))
7445 /* XXX could check well-formedness here */
7453 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7457 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7460 while (off-- && s < lim) {
7461 /* XXX could check well-formedness here */
7468 while (off++ && s > lim) {
7470 if (UTF8_IS_CONTINUED(*s)) {
7471 while (s > lim && UTF8_IS_CONTINUATION(*s))
7474 /* XXX could check well-formedness here */
7483 restore_pos(pTHX_ void *arg)
7486 regexp * const rex = (regexp *)arg;
7487 if (PL_reg_state.re_state_eval_setup_done) {
7488 if (PL_reg_oldsaved) {
7489 rex->subbeg = PL_reg_oldsaved;
7490 rex->sublen = PL_reg_oldsavedlen;
7491 rex->suboffset = PL_reg_oldsavedoffset;
7492 rex->subcoffset = PL_reg_oldsavedcoffset;
7493 #ifdef PERL_OLD_COPY_ON_WRITE
7494 rex->saved_copy = PL_nrs;
7496 RXp_MATCH_COPIED_on(rex);
7498 PL_reg_magic->mg_len = PL_reg_oldpos;
7499 PL_reg_state.re_state_eval_setup_done = FALSE;
7500 PL_curpm = PL_reg_oldcurpm;
7505 S_to_utf8_substr(pTHX_ register regexp *prog)
7507 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7508 * on the converted value */
7512 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7515 if (prog->substrs->data[i].substr
7516 && !prog->substrs->data[i].utf8_substr) {
7517 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7518 prog->substrs->data[i].utf8_substr = sv;
7519 sv_utf8_upgrade(sv);
7520 if (SvVALID(prog->substrs->data[i].substr)) {
7521 if (SvTAIL(prog->substrs->data[i].substr)) {
7522 /* Trim the trailing \n that fbm_compile added last
7524 SvCUR_set(sv, SvCUR(sv) - 1);
7525 /* Whilst this makes the SV technically "invalid" (as its
7526 buffer is no longer followed by "\0") when fbm_compile()
7527 adds the "\n" back, a "\0" is restored. */
7528 fbm_compile(sv, FBMcf_TAIL);
7532 if (prog->substrs->data[i].substr == prog->check_substr)
7533 prog->check_utf8 = sv;
7539 S_to_byte_substr(pTHX_ register regexp *prog)
7541 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7542 * on the converted value; returns FALSE if can't be converted. */
7547 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7550 if (prog->substrs->data[i].utf8_substr
7551 && !prog->substrs->data[i].substr) {
7552 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7553 if (! sv_utf8_downgrade(sv, TRUE)) {
7556 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7557 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7558 /* Trim the trailing \n that fbm_compile added last
7560 SvCUR_set(sv, SvCUR(sv) - 1);
7561 fbm_compile(sv, FBMcf_TAIL);
7565 prog->substrs->data[i].substr = sv;
7566 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7567 prog->check_substr = sv;
7574 /* These constants are for finding GCB=LV and GCB=LVT. These are for the
7575 * pre-composed Hangul syllables, which are all in a contiguous block and
7576 * arranged there in such a way so as to facilitate alorithmic determination of
7577 * their characteristics. As such, they don't need a swash, but can be
7578 * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one
7580 #define SBASE 0xAC00 /* Start of block */
7581 #define SCount 11172 /* Length of block */
7584 #if 0 /* This routine is not currently used */
7585 PERL_STATIC_INLINE bool
7586 S_is_utf8_X_LV(pTHX_ const U8 *p)
7588 /* Unlike most other similarly named routines here, this does not create a
7589 * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7593 UV cp = valid_utf8_to_uvchr(p, NULL);
7595 PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7597 /* The earliest Unicode releases did not have these precomposed Hangul
7598 * syllables. Set to point to undef in that case, so will return false on
7600 if (! PL_utf8_X_LV) { /* Set up if this is the first time called */
7601 PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7602 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7603 SvREFCNT_dec(PL_utf8_X_LV);
7604 PL_utf8_X_LV = &PL_sv_undef;
7608 return (PL_utf8_X_LV != &PL_sv_undef
7609 && cp >= SBASE && cp < SBASE + SCount
7610 && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7614 PERL_STATIC_INLINE bool
7615 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7617 /* Unlike most other similarly named routines here, this does not create a
7618 * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7622 UV cp = valid_utf8_to_uvchr(p, NULL);
7624 PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7626 /* The earliest Unicode releases did not have these precomposed Hangul
7627 * syllables. Set to point to undef in that case, so will return false on
7629 if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */
7630 PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7631 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7632 SvREFCNT_dec(PL_utf8_X_LVT);
7633 PL_utf8_X_LVT = &PL_sv_undef;
7637 return (PL_utf8_X_LVT != &PL_sv_undef
7638 && cp >= SBASE && cp < SBASE + SCount
7639 && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7644 * c-indentation-style: bsd
7646 * indent-tabs-mode: nil
7649 * ex: set ts=8 sts=4 sw=4 et: